From 034c1348a02a4ea6b86cc45d40b8c3b139881708 Mon Sep 17 00:00:00 2001 From: motiwari Date: Fri, 15 Aug 2025 18:12:55 -0700 Subject: [PATCH 01/29] Removing dynamic database docstrings --- dynamic_database.py | 159 -------------------------------------------- 1 file changed, 159 deletions(-) diff --git a/dynamic_database.py b/dynamic_database.py index ed6e229..4bb2862 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -37,20 +37,6 @@ def parse_pos(pos_str): raise ValueError(f"Unexpected format for Pos: {pos_str}") @dataclass -""" -Annotation class represents a code annotation with its full name, definition path, -and position details. -Attributes: - full_name (str): The full name of the annotation. - def_path (str): The file path where the annotation is defined. - def_pos (Pos): The starting position of the annotation definition. - def_end_pos (Pos): The ending position of the annotation definition. -Methods: - from_dict(data: Dict) -> Annotation: - Creates an Annotation instance from a dictionary. - to_dict() -> Dict: - Converts the Annotation instance to a dictionary. -""" class Annotation: full_name: str def_path: str @@ -77,27 +63,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -AnnotatedTactic is a data class that represents a tactic with its annotations and states before and after its application. -Attributes: - tactic (str): The tactic applied. - annotated_tactic (Tuple[str, List[Annotation]]): A tuple containing the tactic and a list of annotations. - state_before (str): The state before the tactic is applied. - state_after (str): The state after the tactic is applied. -Methods: - from_dict(cls, data: Dict) -> AnnotatedTactic: - Creates an AnnotatedTactic instance from a dictionary. - Args: - data (Dict): A dictionary containing the keys "tactic", "annotated_tactic", "state_before", and "state_after". - Returns: - AnnotatedTactic: An instance of AnnotatedTactic. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the AnnotatedTactic instance to a dictionary. - Returns: - Dict: A dictionary representation of the AnnotatedTactic instance. -""" class AnnotatedTactic: tactic: str annotated_tactic: Tuple[str, List[Annotation]] @@ -130,24 +95,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -A class to represent a theorem with its associated metadata. -Attributes: - full_name (str): The full name of the theorem. - file_path (Path): The file path where the theorem is located. - start (Pos): The starting position of the theorem in the file. - end (Pos): The ending position of the theorem in the file. - url (str): The URL associated with the theorem. - commit (str): The commit hash associated with the theorem. - theorem_statement (str, optional): The statement of the theorem. - traced_tactics (Optional[List[AnnotatedTactic]], optional): A list of traced tactics. - difficulty_rating (Optional[float], optional): The difficulty rating of the theorem. -Methods: - __eq__(self, other): Checks if two Theorem instances are equal. - is_same_theorem(self, other: Theorem) -> bool: Checks if two Theorem instances represent the same theorem. - from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: Creates a Theorem instance from a dictionary. - to_dict(self) -> Dict: Converts the Theorem instance to a dictionary. -""" class Theorem: full_name: str file_path: Path @@ -202,28 +149,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -A class representing a Premise with various attributes. 
-Attributes: - full_name (str): The full name of the premise. - code (str): The code associated with the premise. - start (Pos): The starting position of the premise. - end (Pos): The ending position of the premise. - kind (str): The kind or type of the premise. -Methods: - from_dict(cls, data: Dict) -> Premise: - Creates an instance of Premise from a dictionary. - Args: - data (Dict): A dictionary containing the premise data. - Returns: - Premise: An instance of the Premise class. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the Premise instance to a dictionary. - Returns: - Dict: A dictionary representation of the Premise instance. -""" class Premise: full_name: str code: str @@ -253,26 +178,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -Represents a file containing premises and their associated imports. -Attributes: - path (Path): The file path. - imports (List[str]): A list of import statements. - premises (List[Premise]): A list of premises. -Methods: - from_dict(cls, data: Dict) -> PremiseFile: - Creates an instance of PremiseFile from a dictionary. - Args: - data (Dict): A dictionary containing the keys "path", "imports", and "premises". - Returns: - PremiseFile: An instance of PremiseFile. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the PremiseFile instance to a dictionary. - Returns: - Dict: A dictionary representation of the PremiseFile instance. -""" class PremiseFile: path: Path imports: List[str] @@ -296,41 +201,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -Repository class represents a repository with various attributes and methods to manage theorems and premise files. -Attributes: - url (str): URL of the repository. - name (str): Name of the repository. - commit (str): Commit hash of the repository. - lean_version (str): Version of Lean used in the repository. - lean_dojo_version (str): Version of Lean Dojo used in the repository. - metadata (Dict[str, str]): Metadata associated with the repository. - proven_theorems (List[Theorem]): List of proven theorems. - sorry_theorems_proved (List[Theorem]): List of sorry theorems that have been proved. - sorry_theorems_unproved (List[Theorem]): List of sorry theorems that are unproved. - premise_files (List[PremiseFile]): List of premise files. - files_traced (List[Path]): List of traced files. - pr_url (Optional[str]): URL of the pull request. -Methods: - __eq__(self, other): Checks equality between two Repository instances. - __hash__(self): Returns the hash value of the Repository instance. - total_theorems(self) -> int: Returns the total number of theorems. - num_proven_theorems(self) -> int: Returns the number of proven theorems. - num_sorry_theorems_proved(self) -> int: Returns the number of sorry theorems that have been proved. - num_sorry_theorems_unproved(self) -> int: Returns the number of sorry theorems that are unproved. - num_sorry_theorems(self) -> int: Returns the total number of sorry theorems. - num_premise_files(self) -> int: Returns the number of premise files. - num_premises(self) -> int: Returns the total number of premises. - num_files_traced(self) -> int: Returns the number of traced files. - get_all_theorems(self) -> List[Theorem]: Returns a list of all theorems. - get_theorem(self, full_name: str, file_path: str) -> Optional[Theorem]: Retrieves a theorem by its full name and file path. - update_theorem(self, theorem: Theorem) -> None: Updates an existing theorem. 
- get_premise_file(self, path: str) -> Optional[PremiseFile]: Retrieves a premise file by its path. - get_file_traced(self, path: str) -> Optional[Path]: Retrieves a traced file by its path. - from_dict(cls, data: Dict) -> Repository: Creates a Repository instance from a dictionary. - to_dict(self) -> Dict: Converts the Repository instance to a dictionary. - change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: Changes a sorry theorem to a proven theorem and logs the change. -""" class Repository: url: str name: str @@ -544,35 +414,6 @@ def safe_remove_dir_path(dir_path): raise @dataclass -""" -A class that manages a collection of repositories containing Lean theorem proofs. -The DynamicDatabase class provides functionality for: -1. Managing repositories (adding, retrieving, updating, deleting) -2. Generating merged datasets from multiple repositories -3. Splitting theorem data for training/validation/testing -4. Exporting proofs, corpus data, and metadata -Attributes: - repositories: List of Repository objects managed by the database -Methods: - generate_merged_dataset: Creates a merged dataset from multiple repositories - _merge_corpus: Merges premise files from multiple repositories - _split_data: Splits theorem data using different strategies - _split_randomly: Splits theorems randomly into train/val/test sets - _split_by_premise: Splits theorems based on premises to ensure premise novelty - _export_proofs: Exports theorem proofs in JSON format - _export_traced_files: Exports information about traced files - _export_metadata: Exports metadata about repositories and statistics - add_repository: Adds a new repository to the database - get_repository: Retrieves a repository by URL and commit - update_repository: Updates an existing repository - print_database_contents: Logs the current database contents - delete_repository: Removes a repository from the database - to_dict: Converts the database to a dictionary representation - from_dict: Creates a database instance from a dictionary - to_json: Serializes the database to a JSON file - from_json: Deserializes a database from a JSON file - update_json: Updates an existing JSON file with current database state -""" class DynamicDatabase: repositories: List[Repository] = field(default_factory=list) From a475cfbcf9876aaff1240024a1da44b36b673976 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 09:23:09 -0700 Subject: [PATCH 02/29] Running formatter --- .gitignore | 3 + common.py | 15 +- compute_fisher.py | 22 +- custom_progress.py | 26 +- custom_traced_data.py | 11 +- custom_utils.py | 8 +- dynamic_database.py | 376 ++++-- generate_benchmark_lean4.py | 123 +- generator/datamodule.py | 3 +- generator/model.py | 20 +- leanagent.py | 692 +++++++---- leanagent_utils.py | 3 +- prover/evaluate.py | 7 +- prover/proof_search.py | 30 +- prover/search_tree.py | 3 +- replace_files.sh | 4 +- retrieval/datamodule.py | 25 +- retrieval/evaluate.py | 2 +- retrieval/evaluate_multiple.py | 43 +- retrieval/fisher_computation_module.py | 9 +- retrieval/index.py | 5 +- retrieval/main.py | 43 +- retrieval/model.py | 31 +- run_leanagent.sh | 8 +- tests/test_common.py | 2 +- unittest_dynamic_database.py | 1586 ++++++++++++++++-------- 26 files changed, 2063 insertions(+), 1037 deletions(-) diff --git a/.gitignore b/.gitignore index 99e2eca..757bfd1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ *.pkl retrieval/bm25 +.idea/ +.DS_Store +RAID/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/common.py b/common.py 
index 04a119d..ac50437 100644 --- a/common.py +++ b/common.py @@ -38,10 +38,12 @@ def __post_init__(self) -> None: assert isinstance(self.theorem_full_name, str) assert isinstance(self.theorem_pos, Pos) if self.state is not None: - if not (isinstance(self.state, str) + if not ( + isinstance(self.state, str) and "⊢" in self.state and MARK_START_SYMBOL not in self.state - and MARK_END_SYMBOL not in self.state): + and MARK_END_SYMBOL not in self.state + ): logger.warning(f"Invalid state: {self.state}") assert ( isinstance(self.state, str) @@ -56,9 +58,11 @@ def serialize(self) -> str: return "" return self.state + def escape_regex_special_chars(text): return re.escape(text) + @dataclass(unsafe_hash=True) class Premise: """Premises are "documents" in our retrieval setup.""" @@ -202,7 +206,6 @@ def __init__(self, jsonl_path: str) -> None: dep_graph = nx.DiGraph() self.all_premises = [] - for line in open(jsonl_path): file_data = json.loads(line) path = file_data["path"] @@ -222,7 +225,7 @@ def __init__(self, jsonl_path: str) -> None: self.imported_premises_cache = {} self.fill_cache() - def _get_file(self, path: str) -> File: + def _get_file(self, path: str) -> File: # for some reason, the `path` in the parameter starts with ./ # but the paths in the corpus don't # so we need to remove the ./ @@ -471,7 +474,9 @@ def _is_deepspeed_checkpoint(path: str): def load_checkpoint(model_cls, ckpt_path: str, device, freeze: bool, config: dict): """Handle DeepSpeed checkpoints in model loading.""" if not _is_deepspeed_checkpoint(ckpt_path): - model = model_cls.load_from_checkpoint(ckpt_path, strict=False, **config).to(device) + model = model_cls.load_from_checkpoint(ckpt_path, strict=False, **config).to( + device + ) else: with tempfile.TemporaryDirectory() as dirname: path = os.path.join(dirname, "lightning.cpkt") diff --git a/compute_fisher.py b/compute_fisher.py index 352d080..56b32fe 100644 --- a/compute_fisher.py +++ b/compute_fisher.py @@ -3,6 +3,7 @@ new_data_path = "/" + def main(): """ The main function that drives LeanAgent. 
@@ -23,7 +24,7 @@ def main(): try: logger.info("Calculating Fisher Information Matrix for EWC") ### FISHER INFORMATION MATRIX FOR NEXT EWC - + if not torch.cuda.is_available(): logger.warning("Indexing the corpus using CPU can be very slow.") device = torch.device("cpu") @@ -41,7 +42,9 @@ def main(): try: best_model_path = find_latest_checkpoint() logger.info(f"Found latest checkpoint: {best_model_path}") - best_model = PremiseRetriever.load(best_model_path, device, freeze=False, config=config) + best_model = PremiseRetriever.load( + best_model_path, device, freeze=False, config=config + ) except FileNotFoundError as e: logger.error(f"No checkpoint found: {str(e)}") logger.warning("Using the current model state.") @@ -51,8 +54,8 @@ def main(): fisher_module = FisherComputationModule(best_model) VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 * 52 # 1 year - os.environ['TORCH_NCCL_ASYNC_ERROR_HANDLING'] = '1' - os.environ['NCCL_TIMEOUT'] = str(VERY_LONG_TIMEOUT * 1000) + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) ddp_strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) # Setup trainer for Fisher computation @@ -79,9 +82,9 @@ def main(): batch_size=BATCH_SIZE, eval_batch_size=64, max_seq_len=1024, - num_workers=4 + num_workers=4, ) - data_module.setup(stage='fit') + data_module.setup(stage="fit") try: logger.info("right before barrier fisher") @@ -92,7 +95,11 @@ def main(): # Save the FIM if needed if fisher_trainer.is_global_zero: - fisher_file_path = os.path.join(RAID_DIR, FISHER_DIR, f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl") + fisher_file_path = os.path.join( + RAID_DIR, + FISHER_DIR, + f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl", + ) fisher_module.save_fisher_info(fisher_file_path) logger.info(f"Fisher Information Matrix saved at {fisher_file_path}") except Exception as e: @@ -103,5 +110,6 @@ def main(): logger.info(f"An error occurred: {e}", file=sys.stderr) traceback.print_exc() + if __name__ == "__main__": main() diff --git a/custom_progress.py b/custom_progress.py index 6d647c9..7c02ce8 100644 --- a/custom_progress.py +++ b/custom_progress.py @@ -139,7 +139,9 @@ class _Progress(_BaseProgress): def __post_init__(self) -> None: if self.total.__class__ is not self.current.__class__: - raise ValueError("The `total` and `current` instances should be of the same class") + raise ValueError( + "The `total` and `current` instances should be of the same class" + ) def increment_ready(self) -> None: self.total.ready += 1 @@ -147,13 +149,17 @@ def increment_ready(self) -> None: def increment_started(self) -> None: if not isinstance(self.total, _StartedTracker): - raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `started` attribute") + raise TypeError( + f"`{self.total.__class__.__name__}` doesn't have a `started` attribute" + ) self.total.started += 1 self.current.started += 1 def increment_processed(self) -> None: if not isinstance(self.total, _ProcessedTracker): - raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `processed` attribute") + raise TypeError( + f"`{self.total.__class__.__name__}` doesn't have a `processed` attribute" + ) self.total.processed += 1 self.current.processed += 1 @@ -162,7 +168,9 @@ def increment_completed(self) -> None: self.current.completed += 1 @classmethod - def from_defaults(cls, tracker_cls: Type[_ReadyCompletedTracker], **kwargs: int) -> "_Progress": + def from_defaults( + cls, tracker_cls: 
Type[_ReadyCompletedTracker], **kwargs: int + ) -> "_Progress": """Utility function to easily create an instance from keyword arguments to both ``Tracker``s.""" return cls(total=tracker_cls(**kwargs), current=tracker_cls(**kwargs)) @@ -244,8 +252,12 @@ class _OptimizerProgress(_BaseProgress): """ - step: _Progress = field(default_factory=lambda: _Progress.from_defaults(_ReadyCompletedTracker)) - zero_grad: _Progress = field(default_factory=lambda: _Progress.from_defaults(_StartedTracker)) + step: _Progress = field( + default_factory=lambda: _Progress.from_defaults(_ReadyCompletedTracker) + ) + zero_grad: _Progress = field( + default_factory=lambda: _Progress.from_defaults(_StartedTracker) + ) @override def reset(self) -> None: @@ -297,4 +309,4 @@ def reset_on_restart(self) -> None: def load_state_dict(self, state_dict: dict) -> None: if state_dict["optimizer"]["step"]["total"]["completed"] == None: state_dict["optimizer"]["step"]["total"]["completed"] = 0 - self.optimizer.load_state_dict(state_dict["optimizer"]) \ No newline at end of file + self.optimizer.load_state_dict(state_dict["optimizer"]) diff --git a/custom_traced_data.py b/custom_traced_data.py index 26e7117..ae569e4 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -1,5 +1,4 @@ -"""This module defines traced repos/files/theorems. -""" +"""This module defines traced repos/files/theorems.""" import re import os @@ -1080,7 +1079,7 @@ def from_traced_files( TracedFile.from_traced_file(root_dir, path, repo) for path in tqdm(json_paths) ] - + dependencies = repo.get_dependencies(root_dir) if build_deps: traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) @@ -1110,7 +1109,7 @@ def save_to_disk(self) -> None: for tf in tqdm(self.traced_files, total=num_traced_files): _save_xml_to_disk(tf) - + @classmethod def load_from_disk( cls, root_dir: Union[str, Path], build_deps: bool = True @@ -1138,7 +1137,7 @@ def load_from_disk( traced_files = [ TracedFile.from_xml(root_dir, path, repo) for path in tqdm(xml_paths) ] - + dependencies = repo.get_dependencies(root_dir) if build_deps: traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) @@ -1166,4 +1165,4 @@ def get_traced_theorem(self, thm: Theorem) -> Optional[TracedTheorem]: else: assert thm.repo in self.dependencies.values() path = Path(self.name) / LEAN4_PACKAGES_DIR / thm.repo.name / thm.file_path - return self.get_traced_file(path).get_traced_theorem(thm.full_name) \ No newline at end of file + return self.get_traced_file(path).get_traced_theorem(thm.full_name) diff --git a/custom_utils.py b/custom_utils.py index 8f8ea79..dd587a4 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -1,5 +1,4 @@ -"""Utility functions used internally by LeanDojo. -""" +"""Utility functions used internally by LeanDojo.""" import re import os @@ -20,7 +19,7 @@ @contextmanager def working_directory( - path: Optional[Union[str, Path]] = None + path: Optional[Union[str, Path]] = None, ) -> Generator[Path, None, None]: """Context manager setting the current working directory (CWD) to ``path`` (or a temporary directory if ``path`` is None). @@ -52,6 +51,7 @@ def working_directory( if is_temporary: tmp_dir.__exit__(None, None, None) + @contextmanager def report_critical_failure(msg: str) -> Generator[None, None, None]: """Context manager logging ``msg`` in case of any exception. 
@@ -285,4 +285,4 @@ def to_lean_path(root_dir: Path, path: Path, repo) -> bool: else: # E.g., ".lake/build/ir/Mathlib/LinearAlgebra/Basics.lean" or "build/ir/Mathlib/LinearAlgebra/Basics.lean" assert path.is_relative_to(LEAN4_BUILD_DIR / "ir"), path - return path.relative_to(LEAN4_BUILD_DIR / "ir") \ No newline at end of file + return path.relative_to(LEAN4_BUILD_DIR / "ir") diff --git a/dynamic_database.py b/dynamic_database.py index 4bb2862..d819000 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -12,12 +12,13 @@ from loguru import logger import shutil + def parse_pos(pos_str): """ Parses a position string or list into a Pos object. Args: - pos_str (str or list): The position data, either as a string in the format 'Pos(x, y)' + pos_str (str or list): The position data, either as a string in the format 'Pos(x, y)' or as a list [x, y]. Returns: @@ -28,7 +29,9 @@ def parse_pos(pos_str): """ if isinstance(pos_str, str): # pos_str came from a JSON file - pos_parts = pos_str.replace('Pos', '').replace('(', '').replace(')', '').split(',') + pos_parts = ( + pos_str.replace("Pos", "").replace("(", "").replace(")", "").split(",") + ) return Pos(int(pos_parts[0]), int(pos_parts[1])) elif isinstance(pos_str, list): # pos_str came from a dictionary initialization @@ -36,6 +39,7 @@ def parse_pos(pos_str): else: raise ValueError(f"Unexpected format for Pos: {pos_str}") + @dataclass class Annotation: full_name: str @@ -45,23 +49,26 @@ class Annotation: @classmethod def from_dict(cls, data: Dict) -> Annotation: - if not all(key in data for key in ["full_name", "def_path", "def_pos", "def_end_pos"]): + if not all( + key in data for key in ["full_name", "def_path", "def_pos", "def_end_pos"] + ): raise ValueError("Invalid Annotation data format") return cls( full_name=data["full_name"], def_path=data["def_path"], def_pos=parse_pos(data["def_pos"]), - def_end_pos=parse_pos(data["def_end_pos"]) + def_end_pos=parse_pos(data["def_end_pos"]), ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, "def_path": self.def_path, "def_pos": repr(self.def_pos), - "def_end_pos": repr(self.def_end_pos) + "def_end_pos": repr(self.def_end_pos), } + @dataclass class AnnotatedTactic: tactic: str @@ -71,29 +78,33 @@ class AnnotatedTactic: @classmethod def from_dict(cls, data: Dict) -> AnnotatedTactic: - if not all(key in data for key in ["tactic", "annotated_tactic", "state_before", "state_after"]): + if not all( + key in data + for key in ["tactic", "annotated_tactic", "state_before", "state_after"] + ): raise ValueError("Invalid AnnotatedTactic data format") return cls( tactic=data["tactic"], annotated_tactic=( data["annotated_tactic"][0], - [Annotation.from_dict(a) for a in data["annotated_tactic"][1]] + [Annotation.from_dict(a) for a in data["annotated_tactic"][1]], ), state_before=data["state_before"], - state_after=data["state_after"] + state_after=data["state_after"], ) - + def to_dict(self) -> Dict: return { "tactic": self.tactic, "annotated_tactic": [ self.annotated_tactic[0], - [a.to_dict() for a in self.annotated_tactic[1]] + [a.to_dict() for a in self.annotated_tactic[1]], ], "state_before": self.state_before, - "state_after": self.state_after + "state_after": self.state_after, } + @dataclass class Theorem: full_name: str @@ -112,10 +123,12 @@ def __eq__(self, other): return self.is_same_theorem(other) def is_same_theorem(self, other: Theorem) -> bool: - return (self.full_name == other.full_name and - self.file_path == other.file_path and - self.start == other.start and - self.end == 
other.end) + return ( + self.full_name == other.full_name + and self.file_path == other.file_path + and self.start == other.start + and self.end == other.end + ) @classmethod def from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: @@ -132,9 +145,9 @@ def from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: traced_tactics=[ AnnotatedTactic.from_dict(t) for t in data.get("traced_tactics", []) ], - difficulty_rating=data.get("difficulty_rating") + difficulty_rating=data.get("difficulty_rating"), ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, @@ -145,9 +158,10 @@ def to_dict(self) -> Dict: "url": self.url, "commit": self.commit, "traced_tactics": [t.to_dict() for t in (self.traced_tactics or [])], - "difficulty_rating": self.difficulty_rating + "difficulty_rating": self.difficulty_rating, } + @dataclass class Premise: full_name: str @@ -158,25 +172,28 @@ class Premise: @classmethod def from_dict(cls, data: Dict) -> Premise: - if not all(key in data for key in ["full_name", "code", "start", "end", "kind"]): + if not all( + key in data for key in ["full_name", "code", "start", "end", "kind"] + ): raise ValueError("Invalid Premise data format") return cls( full_name=data["full_name"], code=data["code"], start=parse_pos(data["start"]), end=parse_pos(data["end"]), - kind=data["kind"] + kind=data["kind"], ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, "code": self.code, "start": repr(self.start), "end": repr(self.end), - "kind": self.kind + "kind": self.kind, } + @dataclass class PremiseFile: path: Path @@ -190,16 +207,17 @@ def from_dict(cls, data: Dict) -> PremiseFile: return cls( path=Path(data["path"]), imports=data["imports"], - premises=[Premise.from_dict(p) for p in data["premises"]] + premises=[Premise.from_dict(p) for p in data["premises"]], ) - + def to_dict(self) -> Dict: return { "path": str(self.path), "imports": self.imports, - "premises": [p.to_dict() for p in self.premises] + "premises": [p.to_dict() for p in self.premises], } + @dataclass class Repository: url: str @@ -218,14 +236,24 @@ class Repository: def __eq__(self, other): if not isinstance(other, Repository): return NotImplemented - return (self.url == other.url and - self.name == other.name and - self.commit == other.commit and - self.lean_version == other.lean_version and - self.lean_dojo_version == other.lean_dojo_version) + return ( + self.url == other.url + and self.name == other.name + and self.commit == other.commit + and self.lean_version == other.lean_version + and self.lean_dojo_version == other.lean_dojo_version + ) def __hash__(self): - return hash((self.url, self.name, self.commit, self.lean_version, self.lean_dojo_version)) + return hash( + ( + self.url, + self.name, + self.commit, + self.lean_version, + self.lean_dojo_version, + ) + ) @property def total_theorems(self) -> int: @@ -246,7 +274,7 @@ def num_sorry_theorems_unproved(self) -> int: @property def num_sorry_theorems(self) -> int: return self.num_sorry_theorems_proved + self.num_sorry_theorems_unproved - + @property def num_premise_files(self) -> int: return len(self.premise_files) @@ -258,26 +286,41 @@ def num_premises(self) -> int: @property def num_files_traced(self) -> int: return len(self.files_traced) - + @property def get_all_theorems(self) -> List[Theorem]: - return self.proven_theorems + self.sorry_theorems_proved + self.sorry_theorems_unproved - + return ( + self.proven_theorems + + self.sorry_theorems_proved + + self.sorry_theorems_unproved + ) + def get_theorem(self, 
full_name: str, file_path: str) -> Optional[Theorem]: - for thm_list in [self.proven_theorems, self.sorry_theorems_proved, self.sorry_theorems_unproved]: + for thm_list in [ + self.proven_theorems, + self.sorry_theorems_proved, + self.sorry_theorems_unproved, + ]: for thm in thm_list: - if thm.full_name == full_name and (str(thm.file_path) == file_path or (file_path == "" and str(thm.file_path) == ".")): + if thm.full_name == full_name and ( + str(thm.file_path) == file_path + or (file_path == "" and str(thm.file_path) == ".") + ): return thm return None - + def update_theorem(self, theorem: Theorem) -> None: - for thm_list in [self.proven_theorems, self.sorry_theorems_proved, self.sorry_theorems_unproved]: + for thm_list in [ + self.proven_theorems, + self.sorry_theorems_proved, + self.sorry_theorems_unproved, + ]: for i, thm in enumerate(thm_list): if thm.is_same_theorem(theorem): thm_list[i] = theorem return raise ValueError(f"Theorem '{theorem.full_name}' not found.") - + def get_premise_file(self, path: str) -> Optional[PremiseFile]: return next((pf for pf in self.premise_files if str(pf.path) == path), None) @@ -286,15 +329,27 @@ def get_file_traced(self, path: str) -> Optional[Path]: @classmethod def from_dict(cls, data: Dict) -> Repository: - if not all(key in data for key in ["url", "name", "commit", "lean_version", "lean_dojo_version", "metadata"]): + if not all( + key in data + for key in [ + "url", + "name", + "commit", + "lean_version", + "lean_dojo_version", + "metadata", + ] + ): raise ValueError("Invalid Repository data format") if "date_processed" not in data["metadata"]: raise ValueError("Metadata must contain the 'date_processed' key") metadata = data["metadata"].copy() if isinstance(metadata["date_processed"], str): - metadata["date_processed"] = datetime.datetime.fromisoformat(metadata["date_processed"]) - + metadata["date_processed"] = datetime.datetime.fromisoformat( + metadata["date_processed"] + ) + repo = cls( url=data["url"], name=data["name"], @@ -303,48 +358,72 @@ def from_dict(cls, data: Dict) -> Repository: lean_dojo_version=data["lean_dojo_version"], metadata=metadata, files_traced=[], - pr_url=data.get("pr_url") + pr_url=data.get("pr_url"), ) - if all(key in data for key in ["theorems_folder", "premise_files_corpus", "files_traced"]): - if not all(os.path.exists(data[key]) for key in ["theorems_folder", "premise_files_corpus", "files_traced"]): - raise ValueError("Paths to data cannot be empty when creating repo from dataset") + if all( + key in data + for key in ["theorems_folder", "premise_files_corpus", "files_traced"] + ): + if not all( + os.path.exists(data[key]) + for key in ["theorems_folder", "premise_files_corpus", "files_traced"] + ): + raise ValueError( + "Paths to data cannot be empty when creating repo from dataset" + ) theorems_folder = Path(data["theorems_folder"]) for file in theorems_folder.glob("*.json"): - with open(file, 'r') as f: + with open(file, "r") as f: theorem_data = json.load(f) for t_data in tqdm(theorem_data): theorem = Theorem.from_dict(t_data, repo.url, repo.commit) - if any('sorry' in step.tactic for step in (theorem.traced_tactics or [])): + if any( + "sorry" in step.tactic + for step in (theorem.traced_tactics or []) + ): repo.sorry_theorems_unproved.append(theorem) else: repo.proven_theorems.append(theorem) - with open(data["premise_files_corpus"], 'r') as f: + with open(data["premise_files_corpus"], "r") as f: for line in f: premise_file_data = json.loads(line) premise_file = PremiseFile.from_dict(premise_file_data) 
repo.premise_files.append(premise_file) - with open(data["files_traced"], 'r') as f: + with open(data["files_traced"], "r") as f: for line in f: traced_file_data = json.loads(line) repo.files_traced.append(Path(traced_file_data["traced_file_path"])) else: # Process theorems and premises from the existing data structure - repo.proven_theorems = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("proven_theorems", [])] - repo.sorry_theorems_proved = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("sorry_theorems_proved", [])] - repo.sorry_theorems_unproved = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("sorry_theorems_unproved", [])] - repo.premise_files = [PremiseFile.from_dict(pf) for pf in data.get("premise_files", [])] + repo.proven_theorems = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("proven_theorems", []) + ] + repo.sorry_theorems_proved = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("sorry_theorems_proved", []) + ] + repo.sorry_theorems_unproved = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("sorry_theorems_unproved", []) + ] + repo.premise_files = [ + PremiseFile.from_dict(pf) for pf in data.get("premise_files", []) + ] repo.files_traced = [Path(file) for file in data.get("files_traced", [])] return repo - + def to_dict(self) -> Dict: metadata_copy = self.metadata.copy() if isinstance(metadata_copy["date_processed"], datetime.datetime): - metadata_copy["date_processed"] = metadata_copy["date_processed"].isoformat() + metadata_copy["date_processed"] = metadata_copy[ + "date_processed" + ].isoformat() return { "url": self.url, "name": self.name, @@ -362,10 +441,12 @@ def to_dict(self) -> Dict: "num_files_traced": self.num_files_traced, "proven_theorems": [t.to_dict() for t in self.proven_theorems], "sorry_theorems_proved": [t.to_dict() for t in self.sorry_theorems_proved], - "sorry_theorems_unproved": [t.to_dict() for t in self.sorry_theorems_unproved], + "sorry_theorems_unproved": [ + t.to_dict() for t in self.sorry_theorems_unproved + ], "premise_files": [pf.to_dict() for pf in self.premise_files], "files_traced": [str(file) for file in self.files_traced], - "pr_url": self.pr_url + "pr_url": self.pr_url, } def change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: @@ -374,28 +455,31 @@ def change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: self.sorry_theorems_proved.append(theorem) message = f"Theorem proved: {theorem.full_name} in {theorem.file_path} for repo {self.name} (commit: {self.commit})" - timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_entry = f"{timestamp} - {message}\n" - + os.makedirs(os.path.dirname(log_file), exist_ok=True) - - with open(log_file, 'a') as f: + + with open(log_file, "a") as f: f.write(log_entry) else: - raise ValueError("The theorem is not in the list of unproved sorry theorems.") + raise ValueError( + "The theorem is not in the list of unproved sorry theorems." + ) + def safe_remove_dir_path(dir_path): """ Safely removes a directory if it exists. - + Attempts to remove the directory multiple times in case of permission errors. 
- + Args: dir_path (Path): Path object representing the directory to remove - + Raises: PermissionError: If the directory cannot be removed after multiple attempts - + Returns: None """ @@ -410,30 +494,43 @@ def safe_remove_dir_path(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise + @dataclass class DynamicDatabase: repositories: List[Repository] = field(default_factory=list) SPLIT = Dict[str, List[Theorem]] - def generate_merged_dataset(self, output_path: Path, repos_to_include: Optional[List[Tuple[str, str]]] = None) -> None: + def generate_merged_dataset( + self, + output_path: Path, + repos_to_include: Optional[List[Tuple[str, str]]] = None, + ) -> None: """ Generate a merged dataset from multiple repositories in the database. - + :param output_path: Path where the merged dataset will be saved - :param repos_to_include: List of tuples (url, commit) of repositories to include in the dataset. + :param repos_to_include: List of tuples (url, commit) of repositories to include in the dataset. If None, all repos are included. """ random.seed(3407) - + output_path.mkdir(parents=True, exist_ok=True) - repos_to_process = self.repositories if repos_to_include is None else [ - repo for repo in self.repositories if (repo.url, repo.commit) in repos_to_include - ] + repos_to_process = ( + self.repositories + if repos_to_include is None + else [ + repo + for repo in self.repositories + if (repo.url, repo.commit) in repos_to_include + ] + ) if repos_to_include is None: logger.info("Merging all repositories in the database.") @@ -447,7 +544,14 @@ def generate_merged_dataset(self, output_path: Path, repos_to_include: Optional[ for repo in repos_to_process: for theorem in repo.get_all_theorems: - key = (theorem.file_path, theorem.full_name, list(theorem.start)[0], list(theorem.start)[1], list(theorem.end)[0], list(theorem.end)[1]) + key = ( + theorem.file_path, + theorem.full_name, + list(theorem.start)[0], + list(theorem.start)[1], + list(theorem.end)[0], + list(theorem.end)[1], + ) date_processed = repo.metadata["date_processed"] if isinstance(date_processed, str): date_processed = datetime.datetime.fromisoformat(date_processed) @@ -486,19 +590,25 @@ def _merge_corpus(self, repos: List[Repository], output_path: Path) -> None: "code": premise.code, "start": list(premise.start), "end": list(premise.end), - "kind": premise.kind - } for premise in premise_file.premises - ] + "kind": premise.kind, + } + for premise in premise_file.premises + ], } - path = file_data['path'] + path = file_data["path"] if path not in merged_corpus: merged_corpus[path] = json.dumps(file_data) - with open(output_path / "corpus.jsonl", 'w') as f: + with open(output_path / "corpus.jsonl", "w") as f: for line in merged_corpus.values(): f.write(line + "\n") - def _split_data(self, theorems: List[Theorem], num_val_pct: float = 0.02, num_test_pct: float = 0.02) -> Dict[str, SPLIT]: + def _split_data( + self, + theorems: List[Theorem], + num_val_pct: float = 0.02, + num_test_pct: float = 0.02, + ) -> Dict[str, SPLIT]: num_theorems = len(theorems) num_val = int(num_theorems * num_val_pct) num_test = int(num_theorems * num_test_pct) @@ -508,7 +618,9 @@ def _split_data(self, theorems: List[Theorem], num_val_pct: float = 0.02, num_te "novel_premises": self._split_by_premise(theorems, num_val, num_test), } - def 
_split_randomly(self, theorems: List[Theorem], num_val: int, num_test: int) -> SPLIT: + def _split_randomly( + self, theorems: List[Theorem], num_val: int, num_test: int + ) -> SPLIT: random.shuffle(theorems) num_train = len(theorems) - num_val - num_test return { @@ -517,7 +629,9 @@ def _split_randomly(self, theorems: List[Theorem], num_val: int, num_test: int) "test": theorems[num_train + num_val :], } - def _split_by_premise(self, theorems: List[Theorem], num_val: int, num_test: int) -> SPLIT: + def _split_by_premise( + self, theorems: List[Theorem], num_val: int, num_test: int + ) -> SPLIT: num_val_test = num_val + num_test theorems_val_test = [] @@ -528,11 +642,15 @@ def _split_by_premise(self, theorems: List[Theorem], num_val: int, num_test: int for annotation in tactic.annotated_tactic[1]: theorems_by_premises[annotation.full_name].append(t) - theorems_by_premises = sorted(theorems_by_premises.items(), key=lambda x: len(x[1])) + theorems_by_premises = sorted( + theorems_by_premises.items(), key=lambda x: len(x[1]) + ) for _, thms in theorems_by_premises: if len(theorems_val_test) < num_val_test: - theorems_val_test.extend([t for t in thms if t not in theorems_val_test]) + theorems_val_test.extend( + [t for t in thms if t not in theorems_val_test] + ) else: break @@ -563,9 +681,10 @@ def _export_proofs(self, splits: Dict[str, SPLIT], output_path: Path) -> None: "full_name": a.full_name, "def_path": str(a.def_path), "def_pos": list(a.def_pos), - "def_end_pos": list(a.def_end_pos) - } for a in t.annotated_tactic[1] - ] + "def_end_pos": list(a.def_end_pos), + } + for a in t.annotated_tactic[1] + ], ], "state_before": t.state_before, "state_after": t.state_after, @@ -573,27 +692,29 @@ def _export_proofs(self, splits: Dict[str, SPLIT], output_path: Path) -> None: for t in thm.traced_tactics if t.state_before != "no goals" and "·" not in t.tactic ] - data.append({ - "url": thm.url, - "commit": thm.commit, - "file_path": str(thm.file_path), - "full_name": thm.full_name, - "theorem_statement": thm.theorem_statement, - "start": list(thm.start), - "end": list(thm.end), - "traced_tactics": tactics, - }) + data.append( + { + "url": thm.url, + "commit": thm.commit, + "file_path": str(thm.file_path), + "full_name": thm.full_name, + "theorem_statement": thm.theorem_statement, + "start": list(thm.start), + "end": list(thm.end), + "traced_tactics": tactics, + } + ) output_file = strategy_dir / f"{name}.json" - with open(output_file, 'w') as f: + with open(output_file, "w") as f: json.dump(data, f, indent=2) - def _export_traced_files(self, all_traced_files: Set[Path], output_path: Path) -> None: - with open(output_path / "traced_files.jsonl", 'w') as f: + def _export_traced_files( + self, all_traced_files: Set[Path], output_path: Path + ) -> None: + with open(output_path / "traced_files.jsonl", "w") as f: for file in all_traced_files: - f.write(json.dumps({ - "traced_file_path": str(file) - }) + "\n") + f.write(json.dumps({"traced_file_path": str(file)}) + "\n") def _export_metadata(self, repos: List[Repository], output_path: Path) -> None: metadata = { @@ -605,7 +726,8 @@ def _export_metadata(self, repos: List[Repository], output_path: Path) -> None: "lean_version": repo.lean_version, "lean_dojo_version": repo.lean_dojo_version, "metadata": repo.metadata, - } for repo in repos + } + for repo in repos ], "total_theorems": sum(repo.total_theorems for repo in repos), "num_proven_theorems": sum(repo.num_proven_theorems for repo in repos), @@ -617,9 +739,11 @@ def _export_metadata(self, repos: 
List[Repository], output_path: Path) -> None: for repo_data in metadata["repositories"]: if isinstance(repo_data["metadata"]["date_processed"], datetime.datetime): - repo_data["metadata"]["date_processed"] = repo_data["metadata"]["date_processed"].isoformat() - - with open(output_path / "metadata.json", 'w') as f: + repo_data["metadata"]["date_processed"] = repo_data["metadata"][ + "date_processed" + ].isoformat() + + with open(output_path / "metadata.json", "w") as f: json.dump(metadata, f, indent=2) def add_repository(self, repo: Repository) -> None: @@ -628,7 +752,9 @@ def add_repository(self, repo: Repository) -> None: self.repositories.append(repo) logger.info(f"Added new repository: {repo.url} (commit: {repo.commit})") else: - logger.info(f"Repository '{repo.url}' with commit '{repo.commit}' already exists in the database.") + logger.info( + f"Repository '{repo.url}' with commit '{repo.commit}' already exists in the database." + ) def get_repository(self, url: str, commit: str) -> Optional[Repository]: for repo in self.repositories: @@ -637,14 +763,22 @@ def get_repository(self, url: str, commit: str) -> Optional[Repository]: return None def update_repository(self, updated_repo: Repository) -> None: - logger.info(f"Attempting to update repository: {updated_repo.url} (commit: {updated_repo.commit})") + logger.info( + f"Attempting to update repository: {updated_repo.url} (commit: {updated_repo.commit})" + ) for i, repo in enumerate(self.repositories): if repo == updated_repo: self.repositories[i] = updated_repo - logger.info(f"Updated repository: {updated_repo.url} (commit: {updated_repo.commit})") + logger.info( + f"Updated repository: {updated_repo.url} (commit: {updated_repo.commit})" + ) return - logger.error(f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found for update.") - raise ValueError(f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found.") + logger.error( + f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found for update." + ) + raise ValueError( + f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found." 
+        )
 
     def print_database_contents(self):
         logger.info("Current database contents:")
@@ -659,9 +793,7 @@ def delete_repository(self, url: str, commit: str) -> None:
         raise ValueError(f"Repository '{url}' with commit '{commit}' not found.")
 
     def to_dict(self) -> Dict:
-        return {
-            "repositories": [repo.to_dict() for repo in self.repositories]
-        }
+        return {"repositories": [repo.to_dict() for repo in self.repositories]}
 
     @classmethod
     def from_dict(cls, data: Dict) -> DynamicDatabase:
@@ -675,13 +807,13 @@ def from_dict(cls, data: Dict) -> DynamicDatabase:
 
     def to_json(self, file_path: str) -> None:
         """Serialize the database to a JSON file."""
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
     @classmethod
     def from_json(cls, file_path: str) -> DynamicDatabase:
         """Deserialize the database from a JSON file."""
-        with open(file_path, 'r') as f:
+        with open(file_path, "r") as f:
             data = json.load(f)
         return cls.from_dict(data)
 
@@ -695,4 +827,4 @@ def update_json(self, file_path: str) -> None:
 
         for repo in self.repositories:
             existing_db.update_repository(repo)
-        existing_db.to_json(file_path)
\ No newline at end of file
+        existing_db.to_json(file_path)
diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py
index deefbc6..0c72377 100644
--- a/generate_benchmark_lean4.py
+++ b/generate_benchmark_lean4.py
@@ -18,18 +18,20 @@
 
 random.seed(3407)  # https://arxiv.org/abs/2109.08203
 
-RAID_DIR = os.environ.get('RAID_DIR')
+RAID_DIR = os.environ.get("RAID_DIR")
 SPLIT_NAME = str  # train/val/test
 SPLIT = Dict[SPLIT_NAME, List[TracedTheorem]]
 SPLIT_STRATEGY = str
 _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P<version>.+?)")
 
+
 def get_lean4_version_from_config(toolchain: str) -> str:
     """Return the required Lean version given a ``lean-toolchain`` config."""
     m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip())
     assert m is not None, "Invalid config."
     return m["version"]
 
+
 def is_supported_version(v) -> bool:
     """
     Check if ``v`` is at least `v4.3.0-rc2` and at most `v4.8.0-rc1`. 
@@ -42,7 +44,12 @@ def is_supported_version(v) -> bool: return False v = v[1:] major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] - if major < 4 or (major == 4 and minor < 3) or (major == 4 and minor > 8) or (major == 4 and minor == 8 and patch > 1): + if ( + major < 4 + or (major == 4 and minor < 3) + or (major == 4 and minor > 8) + or (major == 4 and minor == 8 and patch > 1) + ): return False if ( major > 4 @@ -57,10 +64,9 @@ def is_supported_version(v) -> bool: else: return True + def _split_sequentially( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """Split ``traced_theorems`` sequentially into train/val/test.""" num_theorems = len(traced_theorems) @@ -73,9 +79,7 @@ def _split_sequentially( def split_randomly( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """Split ``traced_theorems`` randomly into train/val/test.""" logger.info("Splitting the theorems randomly") @@ -83,10 +87,9 @@ def split_randomly( random.shuffle(traced_theorems) return _split_sequentially(traced_theorems, num_val, num_test) + def split_by_premise( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """ Split theorems into train/val/test so that proofs in val/test rely on at @@ -125,12 +128,15 @@ def split_by_premise( "test": theorems_val_test[num_val:], } -def split_data(traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: float = 0.02) -> Dict[SPLIT_STRATEGY, SPLIT]: + +def split_data( + traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: float = 0.02 +) -> Dict[SPLIT_STRATEGY, SPLIT]: """ Split the traced theorems into training, validation, and test sets. This function extracts theorems from the provided TracedRepo object, excluding - theorems from the Lean 4 repository itself. The theorems are then split using + theorems from the Lean 4 repository itself. The theorems are then split using multiple strategies, including random splitting and splitting by novel premises. Args: @@ -159,13 +165,16 @@ def split_data(traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: num_val = int(num_theorems * num_val_pct) num_test = int(num_theorems * num_test_pct) - logger.info(f"{num_theorems} theorems in total, with {num_val} for validation and {num_test} for testing") + logger.info( + f"{num_theorems} theorems in total, with {num_val} for validation and {num_test} for testing" + ) return { "random": split_randomly(traced_theorems, num_val, num_test), "novel_premises": split_by_premise(traced_theorems, num_val, num_test), } + def _get_file_path(traced_repo: TracedRepo, thm: TracedTheorem) -> str: """ Get the file path for a given theorem in a traced repository. @@ -201,8 +210,8 @@ def export_proofs( ) -> None: """ Export proofs from a traced repository to the specified destination path. - This function processes the given splits (organized by strategy) and writes the theorem proofs - to JSON files in the destination directory. Each theorem is exported with its metadata, + This function processes the given splits (organized by strategy) and writes the theorem proofs + to JSON files in the destination directory. Each theorem is exported with its metadata, including URL, commit, file path, theorem statement, and traced tactics. 
Args: splits: Dictionary mapping split strategies to actual splits. Each split maps dataset @@ -246,7 +255,7 @@ def export_proofs( theorem_statement = None if thm.has_tactic_proof() and thm.get_tactic_proof() is not None: theorem_statement = thm.get_theorem_statement() - + data.append( { "url": traced_repo.repo.url, @@ -303,18 +312,13 @@ def export_premises(traced_repo: TracedRepo, dst_path: Path) -> None: logger.info( f"{num_premises} theorems/definitions from {len(traced_repo.traced_files)} files saved to {oup_path}" ) - + oup_path = dst_path / "traced_files.jsonl" with oup_path.open("wt") as oup: for traced_file in traced_repo.traced_files: source_file = traced_file.lean_file source_file_path = source_file.path - oup.write( - json.dumps( - {"traced_file_path": str(source_file_path)} - ) - + "\n" - ) + oup.write(json.dumps({"traced_file_path": str(source_file_path)}) + "\n") return num_premises, len(traced_repo.traced_files) @@ -358,17 +362,17 @@ def export_metadata(traced_repo: TracedRepo, dst_path: Path, **kwargs) -> None: def safe_remove_dir(dir_path): """ Safely removes a directory if it exists. - + This function attempts to remove the specified directory, with multiple retries in case of permission errors. A warning is logged if the directory already exists. - + Args: dir_path (str): Path to the directory to be removed. - + Raises: PermissionError: If the directory cannot be removed after multiple attempts due to permission issues. - + Note: The function will retry up to 5 times with a 0.1 second delay between attempts if a PermissionError occurs. @@ -384,23 +388,25 @@ def safe_remove_dir(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise def safe_remove_dir_path(dir_path): """ Safely removes a directory and all its contents if it exists. - + Uses multiple attempts with a small delay between them to handle potential permission errors that might occur on some systems when removing directories. - + Args: dir_path (Path): Path object representing the directory to remove - + Raises: PermissionError: If the directory cannot be removed after multiple attempts - + Returns: None """ @@ -415,9 +421,12 @@ def safe_remove_dir_path(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise + def export_data( traced_repo: TracedRepo, splits: Dict[SPLIT_STRATEGY, SPLIT], @@ -425,20 +434,20 @@ def export_data( **kwargs, ) -> None: """Export a traced repository's content to a specified destination path. - - This function exports proofs, premises, licenses, and metadata from a traced - repository to a specified destination path. The repository's theorems should have + + This function exports proofs, premises, licenses, and metadata from a traced + repository to a specified destination path. The repository's theorems should have been split using a strategy defined in `splits`. - + Args: traced_repo: The traced repository containing the data to export. splits: Dictionary mapping split strategies to their corresponding splits. dst_path: Destination path where the data will be exported. Can be a string or Path object. **kwargs: Additional keyword arguments to pass to export_metadata. 
- + Returns: tuple: A tuple containing (number of premises, number of files traced, total theorems exported). - + Note: Any existing content at the destination path will be removed. """ @@ -460,17 +469,18 @@ def export_data( return num_premises, num_files_traced, total_theorems + def configure_leandojo(): """ Configure the LeanDojo environment for benchmarking. - + This function sets up the logger configuration for LeanDojo and displays important environment variables including the current working directory and various constants related to process management. - + It removes any existing logger handlers and adds a new handler for stderr with DEBUG level logging. - + No parameters are required, and the function does not return any values. """ constants.logger.remove() @@ -482,11 +492,12 @@ def configure_leandojo(): logger.info(f"Current working directory: {os.getcwd()}") + def main(url, commit, dst_dir): """ Generates a benchmark dataset for Lean 4 proofs from a specified repository. This function clones a Lean 4 repository, configures the appropriate Lean toolchain - version, traces the repository using LeanDojo, and exports the trace data to a + version, traces the repository using LeanDojo, and exports the trace data to a designated directory. Args: url (str): The URL of the Lean 4 Git repository to clone @@ -511,23 +522,27 @@ def main(url, commit, dst_dir): v = get_lean4_version_from_config(config["content"]) logger.info(f"lean version v: {v}") logger.info(f"is supported: {is_supported_version(v)}") - if not is_supported_version(v): # Won't get here since we checked for a compatible commit, but sanity check in case + if not is_supported_version( + v + ): # Won't get here since we checked for a compatible commit, but sanity check in case logger.info("Unsupported version") - v = v[1:] # ignore "v" at beginning - - lean_dir2 = f"/.elan/toolchains/leanprover--lean4---{v}" - lean_dir3 = f"~/.elan/toolchains/leanprover--lean4---{v}" + v = v[1:] # ignore "v" at beginning + + lean_dir2 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" + lean_dir3 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" logger.info(f"lean path2 {lean_dir2}") logger.info(f"lean path3 {lean_dir3}") if not os.path.exists(lean_dir2): logger.info(f"Lean toolchain path 2 does not exist: {lean_dir2}") if not os.path.exists(lean_dir3): logger.info(f"Lean toolchain path 3 does not exist: {lean_dir3}") - os.environ['LEAN4_PATH'] = lean_dir2 - os.environ['PATH'] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" + os.environ["LEAN4_PATH"] = lean_dir2 + os.environ["PATH"] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" logger.info(f"Switched to Lean toolchain at: {lean_dir2}") - logger.info(f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}") + logger.info( + f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}" + ) logger.info(f"repo: {repo}") logger.info("Configuring LeanDojo again...") @@ -544,6 +559,8 @@ def main(url, commit, dst_dir): safe_remove_dir(dst_dir) splits = split_data(traced_repo) logger.info("Successfully split the data") - num_premises, num_files_traced, total_theorems = export_data(traced_repo, splits, dst_dir) + num_premises, num_files_traced, total_theorems = export_data( + traced_repo, splits, dst_dir + ) logger.info("Successfully exported the data") return traced_repo, num_premises, num_files_traced, total_theorems diff --git a/generator/datamodule.py b/generator/datamodule.py index 
95a4af9..882e64a 100644 --- a/generator/datamodule.py +++ b/generator/datamodule.py @@ -25,7 +25,7 @@ class GeneratorDataset(Dataset): """ A PyTorch Dataset for loading and processing data for a generator model that produces tactics given proof states. - This dataset handles loading examples from a JSON file, formatting states and tactics, + This dataset handles loading examples from a JSON file, formatting states and tactics, and optionally augmenting states with retrieved premises. Attributes: @@ -39,6 +39,7 @@ class GeneratorDataset(Dataset): is_train (bool): Whether this dataset is used for training. data (List[Example]): The loaded and processed examples. """ + def __init__( self, data_path: str, diff --git a/generator/model.py b/generator/model.py index d5f5c83..f07e95c 100644 --- a/generator/model.py +++ b/generator/model.py @@ -26,6 +26,7 @@ torch.set_float32_matmul_precision("medium") + def safe_remove_dir(dir_path): """ Safely removes a directory path if it exists, with retries. @@ -57,7 +58,9 @@ def safe_remove_dir(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise @@ -80,6 +83,7 @@ class TopkAccuracy(Metric): update(batch_preds, batch_gt): Updates the state with batch statistics. compute(): Computes the accuracy based on collected state. """ + is_differentiable: Optional[bool] = False higher_is_better: Optional[bool] = True full_state_update: bool = True @@ -295,11 +299,11 @@ def on_fit_start(self) -> None: def validation_step(self, batch: Dict[str, Any], _) -> None: """ Performs a validation step on a batch of data. - - The method computes the loss on the validation data, logs the loss, and generates - tactic candidates using Beam Search. It also logs example inputs/outputs and + + The method computes the loss on the validation data, logs the loss, and generates + tactic candidates using Beam Search. It also logs example inputs/outputs and calculates top-k accuracy metrics for the generated tactics. - + Args: batch: A dictionary containing batch data with the following keys: - state_ids: Tensor of input state token IDs @@ -307,10 +311,10 @@ def validation_step(self, batch: Dict[str, Any], _) -> None: - tactic_ids: Tensor of target tactic token IDs - tactic: List of reference tactic strings _: Batch index (unused) - + Returns: None - + Side effects: - Logs validation loss - Logs example inputs/outputs as text @@ -442,7 +446,7 @@ def batch_generate( Returns: List[List[Tuple[str, float]]]: A list of lists where each inner list contains tuples of (tactic_text, score) for each state. Duplicate tactics are removed. - + Note: If a retriever is configured, it will be used to augment states with relevant premises before generation. 
diff --git a/leanagent.py b/leanagent.py index 969203b..ea7d062 100644 --- a/leanagent.py +++ b/leanagent.py @@ -42,23 +42,28 @@ from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli import torch -from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor, Callback +from pytorch_lightning.callbacks import ( + ModelCheckpoint, + EarlyStopping, + LearningRateMonitor, + Callback, +) from pytorch_lightning import seed_everything # Set the seed for reproducibility random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 -RAID_DIR = os.environ.get('RAID_DIR') -os.environ['RAY_TMPDIR'] = f"{RAID_DIR}/tmp" +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" repo_dir = f"{RAID_DIR}/repos_new" -DATA_DIR = "" -CHECKPOINT_DIR = "" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/LeanAgent/" -DB_FILE_NAME = "" -PROOF_LOG_FILE_NAME = "proof_logs/" -ENCOUNTERED_THEOREMS_FILE = "" -FISHER_DIR = "" # Optional +DATA_DIR = f"{RAID_DIR}/data" +CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" +EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" +ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" +FISHER_DIR = f"{RAID_DIR}/fisher" # Optional repos_for_merged_dataset = [] repos_for_proving = [] @@ -67,10 +72,10 @@ # Feel free to remove any repos from this list if you would like to test on them known_repositories = [ "leanprover-community/mathlib4", # ReProver is trained on this - "leanprover-community/batteries", # functional programming instead of math + "leanprover-community/batteries", # functional programming instead of math "leanprover-community/aesop", "leanprover/lean4", - "leanprover-community/mathlib", # Mathlib3 version + "leanprover-community/mathlib", # Mathlib3 version "leanprover-community/mathlib3", "leanprover/std4", # moved to batteries "leanprover-community/duper", # functional programming instead of math @@ -86,7 +91,7 @@ "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings "teorth/symmetric_project", # no compatible commit "cmu-l3/llmlean", # irrelevant + only 4 theorems - "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps + "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps "avigad/lamr", # trace problems "leanprover-community/quote4", # no theorems "leanprover-community/iris-lean", # trace problems @@ -101,25 +106,25 @@ "risc0/risc0-lean4", "PatrickMassot/verbose-lean4", # no theorems "tydeu/lean4-alloy", # no theorems - "leanprover/leansat", # deprecated - "BoltonBailey/formal-snarks-project", # two theorems - "dwrensha/lean4-maze", # two theorems - "leanprover-community/mathport", # irrelevant + "leanprover/leansat", # deprecated + "BoltonBailey/formal-snarks-project", # two theorems + "dwrensha/lean4-maze", # two theorems + "leanprover-community/mathport", # irrelevant "argumentcomputer/LSpec", # one theorem - "reaslab/jixia", # no theorems - "riccardobrasca/flt3", # no theorems - "dwrensha/animate-lean-proofs", # irrelevant - "lean-ja/lean-by-example", # irrelevant - "NethermindEth/Clear", # no theorems - "fgdorais/lean4-parser", # irrelevant - "semorrison/lean-training-data", # irrelevant - "verse-lab/lean-ssr", # irrelevant - "GaloisInc/lean-llvm", # irrelevant - "argumentcomputer/Wasm.lean", # irrelevant - "NethermindEth/EVMYulLean", # irrelevant - "rwbarton/advent-of-lean-4", # irrelevant - "leanprover-community/tutorials4", # 
irrelevant - "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant + "reaslab/jixia", # no theorems + "riccardobrasca/flt3", # no theorems + "dwrensha/animate-lean-proofs", # irrelevant + "lean-ja/lean-by-example", # irrelevant + "NethermindEth/Clear", # no theorems + "fgdorais/lean4-parser", # irrelevant + "semorrison/lean-training-data", # irrelevant + "verse-lab/lean-ssr", # irrelevant + "GaloisInc/lean-llvm", # irrelevant + "argumentcomputer/Wasm.lean", # irrelevant + "NethermindEth/EVMYulLean", # irrelevant + "rwbarton/advent-of-lean-4", # irrelevant + "leanprover-community/tutorials4", # irrelevant + "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant "leanprover/LNSym", "leanprover-community/flt-regular", "opencompl/lean-mlir-old", @@ -160,7 +165,7 @@ "digama0/mm-lean4", "KislyjKisel/Raylib.lean", "algebraic-dev/melp", - "hhu-adam/Robo", # same as other tutorials but has lots of sorries + "hhu-adam/Robo", # same as other tutorials but has lots of sorries "hargoniX/socket.lean", "kovach/etch", "damek/gd-lean", @@ -169,7 +174,7 @@ "katydid/proofs", "alexjbest/leaff", "sinhp/Poly", - "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries + "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries "lean-ja/lean99", "leanprover/SHerLOC", "Seasawher/mdgen", @@ -183,7 +188,7 @@ "madvorak/fecssk", "david-christiansen/bob24", "awodey/joyal", - "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries + "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries "paulch42/lean-spec", "siddhartha-gadgil/MetaExamples", "dannypsnl/violet", @@ -194,7 +199,7 @@ "kmill/LeanTeX", "leanprover/lean4export", "leanprover-community/mathlib3port", - "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries + "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries "T-Brick/lean-wasm", "crabbo-rave/Soup", "argumentcomputer/RustFFI.lean", @@ -237,14 +242,14 @@ "arthurpaulino/LeanMusic", "argumentcomputer/Ipld.lean", "Odomontois/advent2022-lean", - "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries + "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries "ykonstant1/InfinitePrimes", "alexkassil/natural_number_game_lean4", "seewoo5/lean-poly-abc", "rah4927/lean-dojo-mew", "siddhartha-gadgil/proofs-and-programs-2023", "PatrickMassot/lean4-game-server", - "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries + "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries "katydid/symbolic-automatic-derivatives", "girving/interval", "ImperialCollegeLondon/group-theory-experiments", @@ -253,14 +258,14 @@ "vasnesterov/HadwigerNelson", "FWuermse/lean-postgres", "leanprover-community/import-graph", - "Human-Oriented-ATP/lean-tactics", # more about tactics than premises + "Human-Oriented-ATP/lean-tactics", # more about tactics than premises "paulcadman/lean4-leetcode", "argumentcomputer/Lurk.lean", "AlexDuchnowski/rubiks-cube", "SchrodingerZhu/lean-gccjit", "JamesGallicchio/http", "jtristan/UnicodeSkipListTableExample", - "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries + "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries "remimimimimi/leansec", "hhu-adam/lean-i18n", "RemyDegenne/testing-lower-bounds", @@ -298,9 +303,10 @@ COMMIT_MESSAGE = "[LeanAgent] Proofs" + def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its 
sha.""" - repo_name = "/".join(repo_url.split('/')[-2:]).replace('.git', '') + repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") repo_name = repo_dir + "/" + repo_name @@ -310,28 +316,53 @@ def clone_repo(repo_url): subprocess.run(["git", "clone", repo_url, repo_name]) process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() - sha = re.split(r'\t+', stdout.decode('utf-8'))[0] + sha = re.split(r"\t+", stdout.decode("utf-8"))[0] return repo_name, sha + def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" - proc = subprocess.run(["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True) - branches = proc.stdout.split('\n') + proc = subprocess.run( + ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ) + branches = proc.stdout.split("\n") local_branch = branch_name - remote_branch = f'remote/{branch_name}' - return any(branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) for branch in branches) + remote_branch = f"remote/{branch_name}" + return any( + branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) + for branch in branches + ) + def create_or_switch_branch(repo_name, branch_name, base_branch): """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" if not branch_exists(repo_name, branch_name): - subprocess.run(["git", "-C", repo_name, "checkout", "-b", branch_name], check=True) + subprocess.run( + ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True + ) else: subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) - subprocess.run(["git", "-C", repo_name, "merge", base_branch, "-m", f"Merging {branch_name} into {base_branch}"], check=True) + subprocess.run( + [ + "git", + "-C", + repo_name, + "merge", + base_branch, + "-m", + f"Merging {branch_name} into {base_branch}", + ], + check=True, + ) + def commit_changes(repo_name, commit_message): """Commit changes to a git repository.""" - status = subprocess.run(["git", "-C", repo_name, "status", "--porcelain"], capture_output=True, text=True).stdout.strip() + status = subprocess.run( + ["git", "-C", repo_name, "status", "--porcelain"], + capture_output=True, + text=True, + ).stdout.strip() if status == "": print("No changes to commit.") return False @@ -339,55 +370,56 @@ def commit_changes(repo_name, commit_message): subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) return True + def push_changes(repo_name, branch_name): """Push changes to a git repository.""" - subprocess.run(["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True) + subprocess.run( + ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True + ) + def get_default_branch(repo_full_name): """Get the default branch of a repository (default `main`).""" url = f"https://api.github.com/repos/{repo_full_name}" headers = { "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json" + "Accept": "application/vnd.github.v3+json", } response = requests.get(url, headers=headers) if response.status_code == 200: - return response.json()['default_branch'] + return response.json()["default_branch"] else: logger.info(f"Failed to get default branch for {repo_full_name}") return "main" + def create_pull_request(repo_full_name, title, 
body, head_branch): """Create a pull request in a repository.""" base_branch = get_default_branch(repo_full_name) url = f"https://api.github.com/repos/{repo_full_name}/pulls" headers = { "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json" - } - data = { - "title": title, - "body": body, - "head": head_branch, - "base": base_branch + "Accept": "application/vnd.github.v3+json", } + data = {"title": title, "body": body, "head": head_branch, "base": base_branch} response = requests.post(url, headers=headers, json=data) if response.status_code == 201: - print("Pull request created successfully: " + response.json()['html_url']) - return response.json()['html_url'] + print("Pull request created successfully: " + response.json()["html_url"]) + return response.json()["html_url"] else: print("Failed to create pull request", response.text) return "" + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() - latest_commit = re.split(r'\t+', stdout.decode('utf-8'))[0] + latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] logger.info(f"Latest commit: {latest_commit}") - new_url = url.replace('.git', '') + new_url = url.replace(".git", "") logger.info(f"Creating LeanGitRepo for {new_url}") repo = LeanGitRepo(new_url, latest_commit) logger.info(f"Getting config for {url}") @@ -399,19 +431,21 @@ def get_compatible_commit(url): logger.info(f"Searching for compatible commit for {url}") try: - subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) logger.info("Already in a Git repository") except subprocess.CalledProcessError: logger.info("Not in a Git repository. 
Initializing one.") subprocess.run(["git", "init"], check=True) - + process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, - stderr=subprocess.PIPE + stderr=subprocess.PIPE, ) logger.info(f"Fetching commits for {url}") _, stderr = process.communicate() @@ -421,19 +455,21 @@ def get_compatible_commit(url): process = subprocess.Popen( ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits stdout=subprocess.PIPE, - stderr=subprocess.PIPE + stderr=subprocess.PIPE, ) logger.info(f"Getting list of commits for {url}") stdout, stderr = process.communicate() if process.returncode != 0: raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - commits = stdout.decode('utf-8').strip().split('\n') + commits = stdout.decode("utf-8").strip().split("\n") logger.info(f"Found {len(commits)} commits for {url}") for commit in commits: - new_url = url.replace('.git', '') + new_url = url.replace(".git", "") repo = LeanGitRepo(new_url, commit) config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) + v = generate_benchmark_lean4.get_lean4_version_from_config( + config["content"] + ) if generate_benchmark_lean4.is_supported_version(v): logger.info(f"Found compatible commit {commit} for {url}") return commit, v @@ -444,14 +480,15 @@ def get_compatible_commit(url): logger.info(f"Error in get_compatible_commit: {str(e)}") return None, None + def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" updated_repos = [] for repo in lean_git_repos: url = repo.url - if not url.endswith('.git'): - url = url + '.git' - + if not url.endswith(".git"): + url = url + ".git" + sha = None v = None if "mathlib4" in url: @@ -468,45 +505,52 @@ def find_and_save_compatible_commits(repo_info_file, lean_git_repos): if not sha: logger.info(f"Failed to find a compatible commit for {url}") continue - - updated_repos.append({"url": url.replace('.git', ''), "commit": sha, "version": v}) - - with open(repo_info_file, 'w') as f: + + updated_repos.append( + {"url": url.replace(".git", ""), "commit": sha, "version": v} + ) + + with open(repo_info_file, "w") as f: json.dump(updated_repos, f) - + return updated_repos + def search_github_repositories(language="Lean", num_repos=10): """Search for the given number of repositories on GitHub that have the given language.""" - headers = {'Authorization': personal_access_token} + headers = {"Authorization": personal_access_token} query_params = { - 'q': f'language:{language}', - 'sort': 'stars', - 'order': 'desc', - 'per_page': 100, + "q": f"language:{language}", + "sort": "stars", + "order": "desc", + "per_page": 100, } - + cloned_count = 0 page = 1 while cloned_count < num_repos: - query_params['page'] = page - response = requests.get('https://api.github.com/search/repositories', headers=headers, params=query_params) - + query_params["page"] = page + response = requests.get( + "https://api.github.com/search/repositories", + headers=headers, + params=query_params, + ) + if response.status_code == 200: - repositories = response.json()['items'] + repositories = response.json()["items"] for repo in repositories: if cloned_count >= num_repos: break - repo_full_name = repo['full_name'] + repo_full_name = repo["full_name"] logger.info(f"Processing {repo_full_name}") if repo_full_name not in known_repositories: name = None try: - clone_url = repo['clone_url'] + clone_url = repo["clone_url"] 
repo_name, sha = clone_repo(clone_url) name = repo_name - url = clone_url.replace('.git', '') + url = clone_url.replace(".git", "") lean_git_repo = LeanGitRepo(url, sha) lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) @@ -516,7 +560,9 @@ def search_github_repositories(language="Lean", num_repos=10): shutil.rmtree(name) logger.info(f"Failed to clone {repo_full_name} because of {e}") else: - logger.info(f"Skipping {repo_full_name} since it is a known repository") + logger.info( + f"Skipping {repo_full_name} since it is a known repository" + ) page += 1 else: logger.info("Failed to search GitHub", response.status_code) @@ -525,7 +571,7 @@ def search_github_repositories(language="Lean", num_repos=10): # Check if we've reached the end of the search results if len(repositories) < 100: break - + logger.info(f"Total repositories processed: {cloned_count}") @@ -569,7 +615,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: def load_fisher_information(file_path): """Loads the Fisher Information Matrix.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: fisher_info = pickle.load(f) logger.info("Fisher Information successfully loaded.") return fisher_info @@ -577,38 +623,61 @@ def load_fisher_information(file_path): logger.error(f"No Fisher Information file found at {file_path}.") return None + def find_latest_checkpoint(): """Finds the most recent checkpoint.""" checkpoint_dir = RAID_DIR + "/" + CHECKPOINT_DIR - all_checkpoints = [os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")] + all_checkpoints = [ + os.path.join(checkpoint_dir, f) + for f in os.listdir(checkpoint_dir) + if f.endswith(".ckpt") + ] if not all_checkpoints: raise FileNotFoundError("No checkpoints found.") latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint + def find_latest_fisher(): """Finds the most recent Fisher Information Matrix.""" fisher_dir = RAID_DIR + "/" + FISHER_DIR - all_fisher = [os.path.join(fisher_dir, f) for f in os.listdir(fisher_dir) if f.endswith(".pkl")] + all_fisher = [ + os.path.join(fisher_dir, f) + for f in os.listdir(fisher_dir) + if f.endswith(".pkl") + ] if not all_fisher: raise FileNotFoundError("No Fisher Information Matrices found.") latest_fisher = max(all_fisher, key=os.path.getmtime) logger.info(f"Using the latest Fisher Information Matrix: {latest_fisher}") return latest_fisher -def theorem_identifier(theorem: Theorem) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: + +def theorem_identifier( + theorem: Theorem, +) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: """Returns a unique identifier for a theorem.""" - return (theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end)) + return ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + ) + -def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path): +def process_theorem_batch( + theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path +): """Processes a batch of theorems.""" lean_dojo_theorems = [t[1] for t in theorem_batch] - results = prover.search_unordered(LeanGitRepo(repo.url, repo.commit), lean_dojo_theorems, positions_batch) - + results = prover.search_unordered( + LeanGitRepo(repo.url, repo.commit), lean_dojo_theorems, positions_batch + ) + # Create a mapping from LeanDojoTheorem to our Theorem 
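        # The prover's results are keyed by LeanDojo Theorem objects, so this dict lets each
        # SearchResult be mapped back to the database Theorem whose traced tactics are filled in below.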
theorem_map = {ldj_thm: thm for thm, ldj_thm in theorem_batch} - + for result in results: if isinstance(result, SearchResult): if result.theorem in theorem_map: @@ -620,8 +689,9 @@ def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dyna tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] theorem.traced_tactics = traced_tactics repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -633,47 +703,70 @@ def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dyna logger.warning(f"Theorem not found in theorem_map: {result.theorem}") else: logger.warning(f"Unexpected result type") - + db.to_json(dynamic_database_json_path) + def save_progress(all_encountered_theorems): """Saves the set of encountered theorems.""" logger.info("Saving encountered theorems...") - with open(ENCOUNTERED_THEOREMS_FILE, 'wb') as f: + with open(ENCOUNTERED_THEOREMS_FILE, "wb") as f: pickle.dump(all_encountered_theorems, f) + def load_encountered_theorems(file_path): """Loads the theorems that have been encountered.""" all_encountered_theorems = set() if os.path.exists(file_path): try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: file_content = f.read() if file_content: # Check if the file is not empty all_encountered_theorems = pickle.loads(file_content) else: - logger.warning(f"The file {file_path} is empty. Starting with an empty set.") + logger.warning( + f"The file {file_path} is empty. Starting with an empty set." + ) except (EOFError, pickle.UnpicklingError) as e: - logger.warning(f"Error reading {file_path}: {e}. Starting with an empty set.") + logger.warning( + f"Error reading {file_path}: {e}. Starting with an empty set." + ) except Exception as e: - logger.error(f"Unexpected error when reading {file_path}: {e}. Starting with an empty set.") + logger.error( + f"Unexpected error when reading {file_path}: {e}. Starting with an empty set." + ) else: logger.info(f"The file {file_path} does not exist. 
Starting with an empty set.") - + return all_encountered_theorems -def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic_database_json_path, repos_to_include: Optional[List[Tuple[str, str]]] = None, batch_size: int = 12): + +def prove_sorry_theorems( + db: DynamicDatabase, + prover: DistributedProver, + dynamic_database_json_path, + repos_to_include: Optional[List[Tuple[str, str]]] = None, + batch_size: int = 12, +): """Proves sorry theorems.""" - repos_to_process = db.repositories if repos_to_include is None else [ - repo for repo in db.repositories if (repo.url, repo.commit) in repos_to_include - ] + repos_to_process = ( + db.repositories + if repos_to_include is None + else [ + repo + for repo in db.repositories + if (repo.url, repo.commit) in repos_to_include + ] + ) # To avoid proving the same theorem multiple times, potentially from different versions of the # same repo, we sort the repositories - repos_to_process.sort(key=lambda r: r.metadata['date_processed'], reverse=True) + repos_to_process.sort(key=lambda r: r.metadata["date_processed"], reverse=True) processed_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = set() - all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = set() + all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = ( + set() + ) last_save_time = datetime.datetime.now() save_interval = timedelta(minutes=30) @@ -689,8 +782,10 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic theorem_batch = [] positions_batch = [] - - for theorem in tqdm(sorry_theorems, desc=f"Processing theorems from {repo.name}", unit="theorem"): + + for theorem in tqdm( + sorry_theorems, desc=f"Processing theorems from {repo.name}", unit="theorem" + ): # Ignore sorry theorems from the repo's dependencies if theorem.url != repo_url or theorem.commit != repo_commit: continue @@ -698,7 +793,9 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic theorem_id = theorem_identifier(theorem) if theorem_id in all_encountered_theorems: - logger.info(f"Skipping already encountered theorem: {theorem.full_name}") + logger.info( + f"Skipping already encountered theorem: {theorem.full_name}" + ) continue all_encountered_theorems.add(theorem_id) @@ -707,7 +804,7 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic continue processed_theorems.add(theorem_id) - + logger.info(f"Searching for proof for {theorem.full_name}") logger.info(f"Position: {theorem.start}") @@ -715,14 +812,21 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic lean_dojo_theorem = LeanDojoTheorem( repo=LeanGitRepo(repo_url, repo_commit), file_path=theorem.file_path, - full_name=theorem.full_name + full_name=theorem.full_name, ) theorem_batch.append((theorem, lean_dojo_theorem)) positions_batch.append(Pos(*theorem.start)) if len(theorem_batch) == batch_size: - process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path) + process_theorem_batch( + theorem_batch, + positions_batch, + repo, + db, + prover, + dynamic_database_json_path, + ) theorem_batch = [] positions_batch = [] @@ -730,22 +834,30 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic if current_time - last_save_time >= save_interval: save_progress(all_encountered_theorems) last_save_time = current_time - + # Process any remaining theorems in the last batch if theorem_batch: - 
process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path) + process_theorem_batch( + theorem_batch, + positions_batch, + repo, + db, + prover, + dynamic_database_json_path, + ) save_progress(all_encountered_theorems) logger.info("Finished attempting to prove sorry theorems") + def add_repo_to_database(dynamic_database_json_path, repo, db): """Adds a repository to the dynamic database.""" # Prepare the data necessary to add this repo to the dynamic database url = repo.url - if not url.endswith('.git'): - url = url + '.git' + if not url.endswith(".git"): + url = url + ".git" logger.info(f"Processing {url}") - + if "mathlib4" in url: sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" v = "v4.8.0" @@ -757,19 +869,21 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): v = "v4.8.0-rc1" else: sha, v = get_compatible_commit(url) - + if not sha: logger.info(f"Failed to find a compatible commit for {url}") return None - + logger.info(f"Found compatible commit {sha} for {url}") logger.info(f"Lean version: {v}") - url = url.replace('.git', '') + url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main(repo.url, sha, dst_dir) + traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( + repo.url, sha, dst_dir + ) if not traced_repo: logger.info(f"Failed to trace {url}") return None @@ -797,9 +911,9 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, "files_traced": files_traced, - "pr_url": pr_url + "pr_url": pr_url, } - + repo = Repository.from_dict(data) logger.info("Before adding new repo:") db.print_database_contents() @@ -809,6 +923,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): db.to_json(dynamic_database_json_path) return "Done" + def replace_sorry_with_proof(proofs): """Replace the `sorry` with the proof text in the Lean files.""" logger.info(f"Replacing sorries with {len(proofs)} proofs!") @@ -819,40 +934,44 @@ def replace_sorry_with_proof(proofs): if file_path not in proofs_by_file: proofs_by_file[file_path] = [] proofs_by_file[file_path].append((start, end, proof_text)) - + for file_path, proofs in proofs_by_file.items(): - with open(file_path, 'r') as file: + with open(file_path, "r") as file: lines = file.readlines() # sort proof by starting line and column number (working bottom up retains positions) proofs.sort(key=lambda x: (x[0].line_nb, x[0].column_nb), reverse=True) - + for start, end, proof_text in proofs: start_line, start_col = start.line_nb - 1, start.column_nb - 1 end_line, end_col = end.line_nb - 1, end.column_nb - 1 - original_text = ''.join(lines[start_line:end_line + 1]) - new_text = original_text.replace('sorry', proof_text, 1) - lines[start_line:end_line + 1] = new_text - - with open(file_path, 'w') as file: + original_text = "".join(lines[start_line : end_line + 1]) + new_text = original_text.replace("sorry", proof_text, 1) + lines[start_line : end_line + 1] = new_text + + with open(file_path, "w") as file: file.writelines(lines) logger.info("Finished replacing sorries with proofs!") + def calculate_difficulty(theorem: Theorem) -> Union[float, None]: """Calculates the difficulty of a theorem.""" proof_steps = theorem.traced_tactics - if any('sorry' in step.tactic for step in 
proof_steps): - return float('inf') # Hard (no proof) + if any("sorry" in step.tactic for step in proof_steps): + return float("inf") # Hard (no proof) if len(proof_steps) == 0: return None # To be distributed later return math.exp(len(proof_steps)) -def categorize_difficulty(difficulty: Union[float, None], percentiles: List[float]) -> str: + +def categorize_difficulty( + difficulty: Union[float, None], percentiles: List[float] +) -> str: """Categorizes the difficulty of a theorem.""" if difficulty is None: return "To_Distribute" - if difficulty == float('inf'): + if difficulty == float("inf"): return "Hard (No proof)" elif difficulty <= percentiles[0]: return "Easy" @@ -861,6 +980,7 @@ def categorize_difficulty(difficulty: Union[float, None], percentiles: List[floa else: return "Hard" + def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: """Sorts repositories by the difficulty of their theorems.""" difficulties_by_repo = defaultdict(list) @@ -872,10 +992,18 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: for theorem in repo.get_all_theorems: difficulty = calculate_difficulty(theorem) theorem.difficulty_rating = difficulty - difficulties_by_repo[repo].append((theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end), difficulty)) + difficulties_by_repo[repo].append( + ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + difficulty, + ) + ) if difficulty is not None: all_difficulties.append(difficulty) - + db.update_repository(repo) print(f"Finished {repo.name}") @@ -888,7 +1016,9 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: print(f"Starting {repo.name}") for theorem_name, file_path, start, end, difficulty in theorems: category = categorize_difficulty(difficulty, percentiles) - categorized_theorems[repo][category].append((theorem_name, file_path, start, end, difficulty)) + categorized_theorems[repo][category].append( + (theorem_name, file_path, start, end, difficulty) + ) print(f"Finished {repo.name}") print("Distributed theorems with no proofs") @@ -904,43 +1034,49 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: print(f"Finished {repo.name}") # Sort repositories based on the number of easy theorems - sorted_repos = sorted(categorized_theorems.keys(), key=lambda r: len(categorized_theorems[r]["Easy"]), reverse=True) + sorted_repos = sorted( + categorized_theorems.keys(), + key=lambda r: len(categorized_theorems[r]["Easy"]), + reverse=True, + ) return sorted_repos, categorized_theorems, percentiles + def save_sorted_repos(sorted_repos: List[Repository], file_path: str): """Saves the sorted repositories to a file.""" sorted_repo_data = [ - { - "url": repo.url, - "commit": repo.commit, - "name": repo.name - } for repo in sorted_repos + {"url": repo.url, "commit": repo.commit, "name": repo.name} + for repo in sorted_repos ] - with open(file_path, 'w') as f: + with open(file_path, "w") as f: json.dump(sorted_repo_data, f, indent=2) + def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: """Loads the sorted repositories from a file.""" - with open(file_path, 'r') as f: + with open(file_path, "r") as f: sorted_repo_data = json.load(f) return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + def write_skip_file(repo_url): """Writes a repository URL to a file to skip it.""" skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - with 
open(skip_file_path, 'w') as f: + with open(skip_file_path, "w") as f: f.write(repo_url) + def should_skip_repo(): """Checks if a repository should be skipped.""" skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") if os.path.exists(skip_file_path): - with open(skip_file_path, 'r') as f: + with open(skip_file_path, "r") as f: repo_url = f.read().strip() return True, repo_url return False, None + def main(): """ Main function to run LeanAgent. @@ -955,9 +1091,9 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 15 + num_repos = 1 dynamic_database_json_path = RAID_DIR + "/" + DB_FILE_NAME - + lambdas = None if run_progressive_training: logger.info("Running progressive training") @@ -972,14 +1108,19 @@ def main(): logger.info("LeanDojo configured") # Check if the current process is the main one - is_main_process = int(os.environ.get('LOCAL_RANK', '0')) == 0 + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 # Initialize the database if it doesn't exist or is empty if is_main_process: logger.info("Starting the main process") - if not os.path.exists(dynamic_database_json_path) or os.path.getsize(dynamic_database_json_path) == 0: + if ( + not os.path.exists(dynamic_database_json_path) + or os.path.getsize(dynamic_database_json_path) == 0 + ): # File doesn't exist or is empty, initialize it - logger.info(f"Initializing new database at {dynamic_database_json_path}") + logger.info( + f"Initializing new database at {dynamic_database_json_path}" + ) db = DynamicDatabase() db.to_json(dynamic_database_json_path) else: @@ -989,7 +1130,9 @@ def main(): logger.info(f"Loaded database from {dynamic_database_json_path}") except json.JSONDecodeError: # If there's an error decoding the JSON, initialize a new database - logger.warning(f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database.") + logger.warning( + f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." + ) db = DynamicDatabase() db.to_json(dynamic_database_json_path) @@ -1006,10 +1149,14 @@ def main(): logger.info(f"Processing {repo.url}") result = add_repo_to_database(dynamic_database_json_path, repo, db) if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info(f"Successfully added {num_repos} repositories to the database") - - sorted_repos, categorized_theorems, percentiles = sort_repositories_by_difficulty(db) + logger.info(f"Successfully added repo {repo.url}") + logger.info( + f"Successfully added {num_repos} repositories to the database" + ) + + sorted_repos, categorized_theorems, percentiles = ( + sort_repositories_by_difficulty(db) + ) print("Sorted repositories. 
Saving now...") db.to_json(dynamic_database_json_path) save_sorted_repos(sorted_repos, "sorted_repos.json") @@ -1020,23 +1167,44 @@ def main(): theorems = categorized_theorems[repo][category] print(f" {category}: {len(theorems)} theorems") if theorems: - sorted_theorems = sorted(theorems, key=lambda x: x[2] if x[2] is not None else -float('inf'), reverse=True)[:3] + sorted_theorems = sorted( + theorems, + key=lambda x: ( + x[2] if x[2] is not None else -float("inf") + ), + reverse=True, + )[:3] for name, path, start, end, diff in sorted_theorems: diff_str = f"{diff:.2f}" if diff is not None else "N/A" - print(f" - {name} (File: {path}, Difficulty: {diff_str})") + print( + f" - {name} (File: {path}, Difficulty: {diff_str})" + ) print("\nOverall Statistics:") - total_theorems = sum(len(theorems) for categories in categorized_theorems.values() for theorems in categories.values()) + total_theorems = sum( + len(theorems) + for categories in categorized_theorems.values() + for theorems in categories.values() + ) for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - count = sum(len(categories[category]) for categories in categorized_theorems.values()) + count = sum( + len(categories[category]) + for categories in categorized_theorems.values() + ) percentage = (count / total_theorems) * 100 print(f"{category}: {count} theorems ({percentage:.2f}%)") - print(f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}") - + print( + f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" + ) + logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits(repo_info_file, sorted_repos) - lean_git_repos = [LeanGitRepo(repo['url'], repo['commit']) for repo in updated_repos] + updated_repos = find_and_save_compatible_commits( + repo_info_file, sorted_repos + ) + lean_git_repos = [ + LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos + ] logger.info("Finished finding compatible repositories") else: logger.info("Starting without curriculum learning") @@ -1049,28 +1217,39 @@ def main(): logger.info(f"Processing {repo.url}") result = add_repo_to_database(dynamic_database_json_path, repo, db) if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info(f"Successfully added {num_repos} repositories to the database") + logger.info(f"Successfully added repo {repo.url}") + logger.info( + f"Successfully added {num_repos} repositories to the database" + ) logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits(repo_info_file, lean_git_repos) - lean_git_repos = [LeanGitRepo(repo['url'], repo['commit']) for repo in updated_repos] + updated_repos = find_and_save_compatible_commits( + repo_info_file, lean_git_repos + ) + lean_git_repos = [ + LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos + ] logger.info("Finished finding compatible repositories") # All processes wait for the file to be created and then read from it max_attempts = 30 for attempt in range(max_attempts): try: - with open(repo_info_file, 'r') as f: + with open(repo_info_file, "r") as f: repo_info = json.load(f) break except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: - raise Exception("Failed to read repository information after multiple attempts") + raise Exception( + "Failed to read repository information after multiple attempts" + ) time.sleep(1) - + 
# Load compatible repositories - lean_git_repos = [LeanGitRepo(info['url'].replace('.git', ''), info['commit']) for info in repo_info] + lean_git_repos = [ + LeanGitRepo(info["url"].replace(".git", ""), info["commit"]) + for info in repo_info + ] # Iterate over each repository and lambda value for i in range(num_repos): @@ -1101,8 +1280,10 @@ def main(): logger.info("Repo already in repos_for_merged_dataset") db.generate_merged_dataset(dst_dir, repos_for_merged_dataset) - - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" + + dst_dir = ( + RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" + ) new_data_path = dst_dir logger.info("All GPUs") @@ -1116,16 +1297,18 @@ def main(): except FileNotFoundError as e: logger.error(str(e)) model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" - + # Train the model on the new dataset that we generated from the dynamic database. logger.info("Inside train_test_fisher") logger.info(f"Starting training at epoch {current_epoch}") seed_everything(3407) # Progessive Training - + if not torch.cuda.is_available(): - logger.warning("Indexing the corpus using CPU can be very slow.") + logger.warning( + "Indexing the corpus using CPU can be very slow." + ) device = torch.device("cpu") else: device = torch.device("cuda") @@ -1156,34 +1339,39 @@ def main(): filename_suffix = f"_lambda_{lambda_value}" checkpoint_callback = ModelCheckpoint( dirpath=RAID_DIR + "/" + CHECKPOINT_DIR, - filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", + filename=dir_name + + filename_suffix + + "_{epoch}-{Recall@10_val:.2f}", verbose=True, save_top_k=-1, # Save all checkpoints every_n_epochs=1, # Save every epoch (which is just once in this case) monitor="Recall@10_val", - mode="max" + mode="max", ) - + early_stop_callback = EarlyStopping( - monitor="Recall@10_val", - patience=5, - mode="max", - verbose=True + monitor="Recall@10_val", patience=5, mode="max", verbose=True ) - lr_monitor = LearningRateMonitor(logging_interval='step') + lr_monitor = LearningRateMonitor(logging_interval="step") # Set up environment variables for NCCL VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 * 52 # 1 year - os.environ['TORCH_NCCL_ASYNC_ERROR_HANDLING'] = '1' - os.environ['NCCL_TIMEOUT'] = str(VERY_LONG_TIMEOUT * 1000) + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) # Create a custom log directory for Lightning - custom_log_dir = os.path.join(RAID_DIR, "lightning_logs", f"{dir_name}_{use_fisher}_lambda_{lambda_value}") + custom_log_dir = os.path.join( + RAID_DIR, + "lightning_logs", + f"{dir_name}_{use_fisher}_lambda_{lambda_value}", + ) os.makedirs(custom_log_dir, exist_ok=True) # Initialize DDP strategy - ddp_strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) + ddp_strategy = DDPStrategy( + timeout=timedelta(seconds=VERY_LONG_TIMEOUT) + ) trainer = pl.Trainer( accelerator="gpu", gradient_clip_val=1.0, @@ -1191,7 +1379,11 @@ def main(): strategy=ddp_strategy, devices=4, accumulate_grad_batches=4, - callbacks=[lr_monitor, checkpoint_callback, early_stop_callback], + callbacks=[ + lr_monitor, + checkpoint_callback, + early_stop_callback, + ], max_epochs=current_epoch + epochs_per_repo, log_every_n_steps=1, num_sanity_val_steps=0, @@ -1203,11 +1395,15 @@ def main(): trainer.strategy.barrier() should_skip, skip_repo_url = should_skip_repo() if should_skip: - logger.info(f"Skipping repository {skip_repo_url} due to preprocessing 
issues") + logger.info( + f"Skipping repository {skip_repo_url} due to preprocessing issues" + ) trainer.strategy.barrier() if is_main_process: logger.info("Removing skip file") - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join( + RAID_DIR, DATA_DIR, "skip_repo.txt" + ) os.remove(skip_file_path) continue @@ -1225,35 +1421,51 @@ def main(): batch_size=BATCH_SIZE, eval_batch_size=64, max_seq_len=1024, - num_workers=4 + num_workers=4, ) - data_module.setup(stage='fit') + data_module.setup(stage="fit") - logger.info(f"Training dataset size after load: {len(data_module.ds_train)}") - logger.info(f"Validation dataset size after load: {len(data_module.ds_val)}") - logger.info(f"Testing dataset size after load: {len(data_module.ds_pred)}") + logger.info( + f"Training dataset size after load: {len(data_module.ds_train)}" + ) + logger.info( + f"Validation dataset size after load: {len(data_module.ds_val)}" + ) + logger.info( + f"Testing dataset size after load: {len(data_module.ds_pred)}" + ) - logger.info(f"Starting progressive training from epoch {current_epoch} to {current_epoch + epochs_per_repo}") + logger.info( + f"Starting progressive training from epoch {current_epoch} to {current_epoch + epochs_per_repo}" + ) # Train the model try: logger.info("hit the barrier before training") trainer.strategy.barrier() - trainer.fit(model, datamodule=data_module, ckpt_path=model_checkpoint_path) + trainer.fit( + model, + datamodule=data_module, + ckpt_path=model_checkpoint_path, + ) logger.info("hit the barrier after training") trainer.strategy.barrier() except Exception as e: print(f"An error occurred during training: {str(e)}") print(traceback.format_exc()) - logger.info(f"Finished progressive training at epoch {trainer.current_epoch}") + logger.info( + f"Finished progressive training at epoch {trainer.current_epoch}" + ) # Testing for Average Recall try: best_model_path = find_latest_checkpoint() logger.info(f"Found latest checkpoint: {best_model_path}") - best_model = PremiseRetriever.load(best_model_path, device, freeze=False, config=config) + best_model = PremiseRetriever.load( + best_model_path, device, freeze=False, config=config + ) except FileNotFoundError as e: logger.error(f"No checkpoint found: {str(e)}") logger.warning("Using the current model state.") @@ -1264,15 +1476,19 @@ def main(): logger.info("Testing...") total_R1, total_R10, total_MRR = [], [], [] dataset_path = RAID_DIR + "/" + DATA_DIR - testing_paths = [os.path.join(dataset_path, d) for d in os.listdir(dataset_path)] + testing_paths = [ + os.path.join(dataset_path, d) for d in os.listdir(dataset_path) + ] if is_main_process: with open(EVAL_RESULTS_FILE_PATH, "a") as f: f.write("\n\n\n") - f.write(f"Results for {dir_name} with lambda = {lambda_value}") + f.write( + f"Results for {dir_name} with lambda = {lambda_value}" + ) for data_path in testing_paths: if "merged" not in data_path: continue - + run_cli(best_model_path, data_path) if is_main_process: num_gpus = 4 @@ -1302,18 +1518,24 @@ def main(): avg_R10 = np.mean(total_R10) avg_MRR = np.mean(total_MRR) - logger.info(f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}") + logger.info( + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" + ) if not os.path.exists(EVAL_RESULTS_FILE_PATH): - open(EVAL_RESULTS_FILE_PATH, 'w').close() + open(EVAL_RESULTS_FILE_PATH, "w").close() with open(EVAL_RESULTS_FILE_PATH, "a") as f: f.write("\n\n\n") - f.write(f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = 
{avg_MRR}") + f.write( + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" + ) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" if result is None: - logger.info(f"Skipping repository {repo.url} due to preprocessing issues") + logger.info( + f"Skipping repository {repo.url} due to preprocessing issues" + ) continue if is_main_process: @@ -1326,8 +1548,12 @@ def main(): # Set up the prover use_vllm = False corpus_path = dst_dir + "/corpus.jsonl" - tactic = None # `None` since we are not using a fixed tactic generator - module = None # `None` since we are not using a fixed tactic generator + tactic = ( + None # `None` since we are not using a fixed tactic generator + ) + module = ( + None # `None` since we are not using a fixed tactic generator + ) num_workers = 4 num_gpus = 4 timeout = 600 @@ -1349,14 +1575,21 @@ def main(): raid_dir=RAID_DIR, checkpoint_dir=CHECKPOINT_DIR, debug=debug, - run_progressive_training=run_progressive_training + run_progressive_training=run_progressive_training, ) # Prove sorry theorems if single_repo: - prove_sorry_theorems(db, prover, dynamic_database_json_path, repos_for_merged_dataset) + prove_sorry_theorems( + db, + prover, + dynamic_database_json_path, + repos_for_merged_dataset, + ) else: - prove_sorry_theorems(db, prover, dynamic_database_json_path, repos_for_proving) + prove_sorry_theorems( + db, prover, dynamic_database_json_path, repos_for_proving + ) db.to_json(dynamic_database_json_path) logger.info("Finished searching for proofs of sorry theorems") @@ -1379,7 +1612,7 @@ def main(): # push_changes(repo, TMP_BRANCH) # url = str(create_pull_request(repo_no_dir, PR_TITLE, PR_BODY, TMP_BRANCH)) # shutil.rmtree(repo) - + logger.info("Finished processing the repository") current_epoch += epochs_per_repo logger.info(f"current epoch: {current_epoch}") @@ -1391,5 +1624,6 @@ def main(): logger.info(f"An error occurred: {e}", file=sys.stderr) traceback.print_exc() + if __name__ == "__main__": main() diff --git a/leanagent_utils.py b/leanagent_utils.py index 381065e..684b390 100644 --- a/leanagent_utils.py +++ b/leanagent_utils.py @@ -1,6 +1,7 @@ MARK_START_SYMBOL = "" MARK_END_SYMBOL = "" + def remove_marks(s: str) -> str: """Remove all :code:`` and :code:`` from ``s``.""" - return s.replace(MARK_START_SYMBOL, "").replace(MARK_END_SYMBOL, "") + return s.replace(MARK_START_SYMBOL, "").replace(MARK_END_SYMBOL, "") diff --git a/prover/evaluate.py b/prover/evaluate.py index e5ebcf3..44eec39 100644 --- a/prover/evaluate.py +++ b/prover/evaluate.py @@ -1,5 +1,4 @@ -"""Script for evaluating the prover on theorems extracted by LeanDojo. -""" +"""Script for evaluating the prover on theorems extracted by LeanDojo.""" import os import uuid @@ -27,7 +26,7 @@ def _get_theorems( """ Retrieves a list of Lean theorems from specified files based on given filters. - This function fetches theorems from Lean files using internal helper functions and + This function fetches theorems from Lean files using internal helper functions and validates that all repositories containing the theorems have been traced with LeanDojo. Parameters: @@ -194,7 +193,7 @@ def evaluate( num_sampled_tactics=num_sampled_tactics, debug=verbose, ) - + results = prover.search_unordered(repo, theorems, positions) # Calculate the result statistics. 
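The result-statistics step referenced above reduces, under simple assumptions, to a pass-rate computation over the prover's outputs. The sketch below uses a hypothetical `DummyResult` stand-in and a made-up `pass_rate` helper; it only illustrates the bookkeeping, not the repository's actual SearchResult class or evaluation code.

# Illustrative only: summarize a list of prover outputs, where None marks a
# search that failed to initialize and a non-empty `proof` marks success.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class DummyResult:
    proof: Optional[List[str]]  # list of tactics if proved, otherwise None

def pass_rate(results: List[Optional[DummyResult]]) -> float:
    """Fraction of attempted searches that ended with a proof."""
    attempted = [r for r in results if r is not None]
    proved = [r for r in attempted if r.proof]
    return len(proved) / len(attempted) if attempted else 0.0

if __name__ == "__main__":
    runs = [DummyResult(["simp"]), DummyResult(None), None]
    print(f"pass rate: {pass_rate(runs):.2%}")  # pass rate: 50.00%

A search that hits an initialization error is reported as None, which is why the sketch filters those entries out before computing the rate.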
diff --git a/prover/proof_search.py b/prover/proof_search.py index 6324add..e927630 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -1,5 +1,4 @@ -"""Proof search using best-first search. -""" +"""Proof search using best-first search.""" import os import sys @@ -33,8 +32,9 @@ from prover.search_tree import * from generator.model import RetrievalAugmentedGenerator, FixedTacticGenerator -tolerance = 1 # second -RAID_DIR = os.environ.get('RAID_DIR') +tolerance = 1 # second +RAID_DIR = os.environ.get("RAID_DIR") + @dataclass(frozen=True) class SearchResult: @@ -79,21 +79,21 @@ def search( ) -> Optional[SearchResult]: """ Performs a best-first search to find a proof for the given theorem. - + The search uses a tactic generator to propose tactics and expands the search tree until either a proof is found, the timeout is reached, or the search space is exhausted. - + Args: repo (LeanGitRepo): The Lean Git repository containing the theorem. thm (Theorem): The theorem to be proved. pos (Pos): The position information for the theorem in the source code. - + Returns: Optional[SearchResult]: A SearchResult object containing information about the proof search, including the proof if one was found, or None if there was an initialization error. - + Raises: No explicit exceptions are raised from this method, though internal exceptions are caught and handled. @@ -387,16 +387,22 @@ async def generate(self, prompt: str, num_samples: int) -> RequestOutput: final_output = oup return final_output + def find_latest_checkpoint(raid_dir, checkpoint_dir): """Finds the most recent checkpoint.""" checkpoint_dir = raid_dir + "/" + checkpoint_dir - all_checkpoints = [os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")] + all_checkpoints = [ + os.path.join(checkpoint_dir, f) + for f in os.listdir(checkpoint_dir) + if f.endswith(".ckpt") + ] if not all_checkpoints: raise FileNotFoundError("No checkpoints found.") latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint + class DistributedProver: """A distributed prover that uses Ray to parallelize the proof search. @@ -447,7 +453,7 @@ def __init__( model_checkpoint_path = find_latest_checkpoint(raid_dir, checkpoint_dir) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" - + config = { "model_name": "kaiyuy/leandojo-lean4-retriever-tacgen-byt5-small", "lr": 1e-3, @@ -473,7 +479,7 @@ def __init__( logger.info(f"Loaded indexed corpus from {indexed_corpus_path}") tac_gen.retriever.reindex_corpus(batch_size=32) logger.info("Finished reindexing!") - + self.distributed = num_workers > 1 if not self.distributed: assert num_gpus <= 1 @@ -537,4 +543,4 @@ def search_unordered( logger.error(ex) sys.exit(1) - return results \ No newline at end of file + return results diff --git a/prover/search_tree.py b/prover/search_tree.py index 3174e67..222d030 100644 --- a/prover/search_tree.py +++ b/prover/search_tree.py @@ -1,5 +1,4 @@ -"""Definitions of the search tree used by the prover. -""" +"""Definitions of the search tree used by the prover.""" import math from enum import Enum diff --git a/replace_files.sh b/replace_files.sh index 87c7805..45a3215 100644 --- a/replace_files.sh +++ b/replace_files.sh @@ -26,7 +26,7 @@ # Note: This script modifies installed Python packages. Use with caution as it may # affect the behavior of any code using these packages. 
# ----------------------------------------------------------------------------- -export RAID_DIR="" +export RAID_DIR="~/Desktop/LeanAgent/RAID/" # Replace PyTorch Lightning progress.py python -c "import pytorch_lightning as pl; print(pl.__file__)" > pl_path.txt @@ -59,4 +59,4 @@ cat $UTILS_PY_PATH echo "Replacing $UTILS_PY_PATH" cp ${RAID_DIR}/LeanAgent/custom_utils.py $UTILS_PY_PATH echo "Contents after replacement:" -cat $UTILS_PY_PATH \ No newline at end of file +cat $UTILS_PY_PATH diff --git a/retrieval/datamodule.py b/retrieval/datamodule.py index 3070c05..400ca88 100644 --- a/retrieval/datamodule.py +++ b/retrieval/datamodule.py @@ -53,6 +53,7 @@ class RetrievalDataset(Dataset): 'label': tensor, # Additional metadata fields """ + def __init__( self, data_paths: List[str], @@ -76,14 +77,18 @@ def __init__( def load_or_cache_data(self, data_paths: List[str]) -> List[Example]: cache_file = os.path.join(self.cache_path, "cached_data.pkl") - + # Check if cached data exists if os.path.exists(cache_file): - with open(cache_file, 'rb') as file: + with open(cache_file, "rb") as file: data = pickle.load(file) logger.info(f"Loaded data from cache {cache_file}") else: - data = list(itertools.chain.from_iterable(self._load_data(path) for path in data_paths)) + data = list( + itertools.chain.from_iterable( + self._load_data(path) for path in data_paths + ) + ) # Cache the data # create file if it does not already exist try: @@ -92,7 +97,7 @@ def load_or_cache_data(self, data_paths: List[str]) -> List[Example]: if exc.errno != errno.EEXIST: raise pass - with open(cache_file, 'wb') as file: + with open(cache_file, "wb") as file: pickle.dump(data, file) logger.info(f"Saved loaded data to cache {cache_file}") return data @@ -107,7 +112,10 @@ def _load_data(self, data_path: str) -> List[Example]: state = format_state(tac["state_before"]) # Some states are empty because they are from sorry theorems that have been proven. 
context = Context( - file_path, thm["full_name"], Pos(*thm["start"]), state if state else None + file_path, + thm["full_name"], + Pos(*thm["start"]), + state if state else None, ) all_pos_premises = get_all_pos_premises( tac["annotated_tactic"], self.corpus @@ -295,6 +303,7 @@ class RetrievalDataModule(pl.LightningDataModule): ds_pred : RetrievalDataset Test dataset for prediction """ + def __init__( self, data_path: str, @@ -332,7 +341,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=True, - cache_path=os.path.join(self.data_path, "cache_train") + cache_path=os.path.join(self.data_path, "cache_train"), ) print(f"Training dataset size: {len(self.ds_train)}") @@ -345,7 +354,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=False, - cache_path=os.path.join(self.data_path, "cache_val") + cache_path=os.path.join(self.data_path, "cache_val"), ) print(f"Validation dataset size: {len(self.ds_val)}") @@ -358,7 +367,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=False, - cache_path=os.path.join(self.data_path, "cache_pred") + cache_path=os.path.join(self.data_path, "cache_pred"), ) print(f"Testing dataset size: {len(self.ds_pred)}") diff --git a/retrieval/evaluate.py b/retrieval/evaluate.py index d22df03..c18ddb6 100644 --- a/retrieval/evaluate.py +++ b/retrieval/evaluate.py @@ -117,4 +117,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/retrieval/evaluate_multiple.py b/retrieval/evaluate_multiple.py index e8a6b95..6c9a4d0 100644 --- a/retrieval/evaluate_multiple.py +++ b/retrieval/evaluate_multiple.py @@ -7,10 +7,11 @@ from typing import List, Tuple from loguru import logger + def _eval(data, preds_map) -> Tuple[float, float, float]: """ Evaluates the performance of premise retrieval against ground truth. - + Parameters: ----------- data : list @@ -19,7 +20,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: preds_map : dict Dictionary mapping theorem identifiers to prediction results. Each prediction contains 'all_pos_premises' (ground truth) and 'retrieved_premises' (model predictions). - + Returns: -------- Tuple[float, float, float] @@ -27,7 +28,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: - R1: Top-1 Recall (percentage of times the top retrieved premise is a correct premise) - R10: Recall@10 (percentage of correct premises found in the top 10 retrievals) - MRR: Mean Reciprocal Rank (average of 1/rank where rank is the position of the first correct premise) - + Notes: ------ - For each tactic in each theorem, the function evaluates if the retrieved premises match the ground truth. @@ -37,7 +38,9 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: R1, R10, MRR = [], [], [] for thm in tqdm(data): for i, _ in enumerate(thm["traced_tactics"]): - pred = preds_map.get((thm["file_path"], thm["full_name"], tuple(thm["start"]), i)) + pred = preds_map.get( + (thm["file_path"], thm["full_name"], tuple(thm["start"]), i) + ) if pred is None: continue all_pos_premises = set(pred["all_pos_premises"]) @@ -62,12 +65,13 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: MRR = np.mean(MRR) if MRR else 0 return R1, R10, MRR + def main(): """ Main function for evaluating the premise retriever on multiple data splits. 
The function loads the predictions from a file, evaluates them against each provided data split, - and calculates average metrics across all splits. Metrics include Recall@1, Recall@10, and Mean + and calculates average metrics across all splits. Metrics include Recall@1, Recall@10, and Mean Reciprocal Rank (MRR). Command Line Arguments: @@ -77,14 +81,30 @@ def main(): Returns: None: Results are logged to the console. """ - parser = argparse.ArgumentParser(description="Script for evaluating the premise retriever.") - parser.add_argument("--preds-file", type=str, required=True, help="Path to the retriever's predictions file.") - parser.add_argument("--data-paths", type=str, nargs='+', required=True, help="Paths to the directories containing the data splits.") + parser = argparse.ArgumentParser( + description="Script for evaluating the premise retriever." + ) + parser.add_argument( + "--preds-file", + type=str, + required=True, + help="Path to the retriever's predictions file.", + ) + parser.add_argument( + "--data-paths", + type=str, + nargs="+", + required=True, + help="Paths to the directories containing the data splits.", + ) args = parser.parse_args() logger.info(f"Loading predictions from {args.preds_file}") preds = pickle.load(open(args.preds_file, "rb")) - preds_map = {(p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p for p in preds} + preds_map = { + (p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p + for p in preds + } total_R1, total_R10, total_MRR = [], [], [] for data_path in args.data_paths: @@ -99,7 +119,10 @@ def main(): avg_R1 = np.mean(total_R1) avg_R10 = np.mean(total_R10) avg_MRR = np.mean(total_MRR) - logger.info(f"Average R@1 = {avg_R1} %, Average R@10 = {avg_R10} %, Average MRR = {avg_MRR}") + logger.info( + f"Average R@1 = {avg_R1} %, Average R@10 = {avg_R10} %, Average MRR = {avg_MRR}" + ) + if __name__ == "__main__": main() diff --git a/retrieval/fisher_computation_module.py b/retrieval/fisher_computation_module.py index d08d314..a333575 100644 --- a/retrieval/fisher_computation_module.py +++ b/retrieval/fisher_computation_module.py @@ -4,6 +4,7 @@ import torch.distributed as dist import pickle + class FisherComputationModule(pl.LightningModule): def __init__(self, model): super().__init__() @@ -79,7 +80,7 @@ def on_train_epoch_end(self): across the entire distributed training process. """ logger.info("Synchronizing and normalizing Fisher Information") - + # Synchronize Fisher Information across GPUs # Each GPU now has the sum of the Fisher Information from all GPUs for each parameter for name in self.fisher_info: @@ -97,10 +98,12 @@ def on_train_epoch_end(self): self.fisher_info[name] /= total_samples def configure_optimizers(self): - return torch.optim.SGD(self.model.parameters(), lr=0) # We don't actually want to update the model + return torch.optim.SGD( + self.model.parameters(), lr=0 + ) # We don't actually want to update the model def save_fisher_info(self, fisher_file_path): if self.trainer.is_global_zero: logger.info(f"Saving Fisher Information Matrix to {fisher_file_path}") with open(fisher_file_path, "wb") as f: - pickle.dump(self.fisher_info, f) \ No newline at end of file + pickle.dump(self.fisher_info, f) diff --git a/retrieval/index.py b/retrieval/index.py index 6e0fbf2..1fb2f05 100644 --- a/retrieval/index.py +++ b/retrieval/index.py @@ -1,5 +1,4 @@ -"""Script for indexing the corpus using the retriever. 
-""" +"""Script for indexing the corpus using the retriever.""" import torch import pickle @@ -60,4 +59,4 @@ def main() -> None: main() # python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path /data/yingzi_ma/lean_project/datasets/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl -# python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl \ No newline at end of file +# python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl diff --git a/retrieval/main.py b/retrieval/main.py index ed4094f..331846d 100644 --- a/retrieval/main.py +++ b/retrieval/main.py @@ -1,5 +1,4 @@ -"""Script for training the premise retriever. -""" +"""Script for training the premise retriever.""" import os from typing import Tuple @@ -14,31 +13,51 @@ from retrieval.model import PremiseRetriever from retrieval.datamodule import RetrievalDataModule + class CLI(LightningCLI): def __init__(self, *args, **kwargs): - super().__init__(*args, - save_config_kwargs={"overwrite": True}, - **kwargs) + super().__init__(*args, save_config_kwargs={"overwrite": True}, **kwargs) def add_arguments_to_parser(self, parser) -> None: parser.link_arguments("model.model_name", "data.model_name") parser.link_arguments("data.max_seq_len", "model.max_seq_len") - parser.add_argument('--data-path', type=str, required=True, help='Path to the dataset.') - + parser.add_argument( + "--data-path", type=str, required=True, help="Path to the dataset." + ) + def before_instantiate_classes(self): cur_data_path = vars(vars(self.config)["predict"])["data_path"] # Modify the --config YAML file to include the current data_path - vars(vars(vars(self.config)["predict"])["data"])["data_path"] = cur_data_path + "/random" - vars(vars(vars(self.config)["predict"])["data"])["corpus_path"] = cur_data_path + "/corpus.jsonl" - logger.info(f"Data path: {vars(vars(vars(self.config)['predict'])['data'])['data_path']}") - logger.info(f"Corpus path: {vars(vars(vars(self.config)['predict'])['data'])['corpus_path']}") + vars(vars(vars(self.config)["predict"])["data"])["data_path"] = ( + cur_data_path + "/random" + ) + vars(vars(vars(self.config)["predict"])["data"])["corpus_path"] = ( + cur_data_path + "/corpus.jsonl" + ) + logger.info( + f"Data path: {vars(vars(vars(self.config)['predict'])['data'])['data_path']}" + ) + logger.info( + f"Corpus path: {vars(vars(vars(self.config)['predict'])['data'])['corpus_path']}" + ) + def run_cli(model_path, data_path): logger.info(f"PID: {os.getpid()}") # Mimic command line argument passing - sys.argv = ['main.py', 'predict', '--config', 'retrieval/confs/cli_lean4_random.yaml', '--ckpt_path', model_path, '--data-path', data_path] + sys.argv = [ + "main.py", + "predict", + "--config", + "retrieval/confs/cli_lean4_random.yaml", + "--ckpt_path", + model_path, + "--data-path", + data_path, + ] cli = CLI(PremiseRetriever, RetrievalDataModule) + def main() -> None: logger.info(f"PID: {os.getpid()}") cli = CLI(PremiseRetriever, RetrievalDataModule) diff --git a/retrieval/model.py b/retrieval/model.py index 5ccf2d3..6832674 100644 --- a/retrieval/model.py +++ b/retrieval/model.py @@ -29,6 +29,7 @@ torch.set_float32_matmul_precision("medium") + class PremiseRetriever(pl.LightningModule): """ A PyTorch Lightning module 
implementing a premise retriever for theorem proving. @@ -58,6 +59,7 @@ class PremiseRetriever(pl.LightningModule): max_seq_len (int): Maximum sequence length for tokenization num_retrieved (int, optional): Number of premises to retrieve. Defaults to 100. """ + def __init__( self, model_name: str, @@ -91,22 +93,24 @@ def set_lambda(self, lambda_value): self.lamda = lambda_value def set_previous_params(self): - self.previous_params = {name: param.clone().detach() for name, param in self.named_parameters()} + self.previous_params = { + name: param.clone().detach() for name, param in self.named_parameters() + } def ewc_loss(self): """ Calculate the Elastic Weight Consolidation (EWC) loss. - EWC loss is used to prevent catastrophic forgetting in neural networks by - penalizing changes to important parameters. The penalty is based on the - Fisher Information matrix and the difference between current and previous + EWC loss is used to prevent catastrophic forgetting in neural networks by + penalizing changes to important parameters. The penalty is based on the + Fisher Information matrix and the difference between current and previous parameter values. Returns: - float: The calculated EWC loss. If Fisher information is not available + float: The calculated EWC loss. If Fisher information is not available or lambda is zero, returns 0.0. """ if not self.fisher_info or self.lamda == 0: return 0.0 - + ewc_loss = 0 for name, param in self.named_parameters(): if name in self.fisher_info and name in self.previous_params: @@ -121,7 +125,9 @@ def ewc_loss(self): return total_loss @classmethod - def load(cls, ckpt_path: str, device, freeze: bool, config: dict) -> "PremiseRetriever": + def load( + cls, ckpt_path: str, device, freeze: bool, config: dict + ) -> "PremiseRetriever": return load_checkpoint(cls, ckpt_path, device, freeze, config) @classmethod @@ -158,7 +164,9 @@ def load_corpus(self, path_or_corpus: Union[str, Corpus]) -> None: self.corpus = indexed_corpus.corpus self.corpus_embeddings = indexed_corpus.embeddings self.embeddings_staled = False - logger.info(f"Embeddings staled load corpus pickle: {self.embeddings_staled}") + logger.info( + f"Embeddings staled load corpus pickle: {self.embeddings_staled}" + ) @property def embedding_size(self) -> int: @@ -250,7 +258,6 @@ def on_train_batch_end(self, outputs, batch, _) -> None: """Mark the embeddings as staled after a training batch.""" self.embeddings_staled = True - def configure_optimizers(self) -> Dict[str, Any]: return get_optimizers( self.parameters(), self.trainer, self.lr, self.warmup_steps @@ -274,7 +281,7 @@ def reindex_corpus(self, batch_size: int) -> None: Returns: None - """ + """ if not self.embeddings_staled: return logger.info("Re-indexing the retrieval corpus") @@ -450,12 +457,12 @@ def predict_step(self, batch: Dict[str, Any], _): "scores": s, } ) - + def on_predict_epoch_end(self) -> None: if self.trainer.log_dir is not None: logger.info("About to construct predictions map") gpu_id = self.trainer.local_rank - + preds_map = { (p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p for p in self.predict_step_outputs diff --git a/run_leanagent.sh b/run_leanagent.sh index 66b4870..180ff70 100644 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -24,13 +24,13 @@ # # Usage: bash run_leanagent.sh #!/bin/bash -export RAID_DIR="" -cd ${RAID_DIR}/LeanAgent +export RAID_DIR="~/Desktop/LeanAgent/RAID/" +export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +cd ${LEAN_AGENT_DIR} echo "Script executed from: ${PWD}" -source 
/etc/profile.d/conda.sh +source /Users/motiwari/miniforge3/etc/profile.d/conda.sh conda activate LeanAgent export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" -export GITHUB_ACCESS_TOKEN="" export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" echo "Removing old cache files" rm -rf /tmp/ray diff --git a/tests/test_common.py b/tests/test_common.py index 9bf00cd..6855286 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -17,4 +17,4 @@ def test_remove_marks(input_string, expected_output): """ Tests that remove_marks correctly strips and tags from a string. """ - assert remove_marks(input_string) == expected_output + assert remove_marks(input_string) == expected_output diff --git a/unittest_dynamic_database.py b/unittest_dynamic_database.py index dfbcc93..25c9b03 100644 --- a/unittest_dynamic_database.py +++ b/unittest_dynamic_database.py @@ -4,7 +4,15 @@ import unittest import datetime from pathlib import Path -from dynamic_database import DynamicDatabase, Repository, Theorem, AnnotatedTactic, Annotation, PremiseFile, Premise +from dynamic_database import ( + DynamicDatabase, + Repository, + Theorem, + AnnotatedTactic, + Annotation, + PremiseFile, + Premise, +) from lean_dojo.data_extraction.lean import Pos, LeanGitRepo import generate_benchmark_lean4 import lean_dojo @@ -20,11 +28,12 @@ import os from unittest.mock import patch, MagicMock -RAID_DIR = os.environ.get('RAID_DIR') +RAID_DIR = os.environ.get("RAID_DIR") DATA_DIR = "datasets_new_unittest" MERGED_DATA_DIR = "datasets_merged_unittest" PROOF_LOG_FILE_NAME = "proof_logs_unittest/proof_log_unittest.log" + class TestDynamicDatabaseCore(unittest.TestCase): """ Unit tests for the DynamicDatabase class and related functionality. @@ -42,6 +51,7 @@ class TestDynamicDatabaseCore(unittest.TestCase): The tests use a combination of simple and complex test cases to verify that all aspects of the database function correctly, including edge cases. 
""" + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -72,7 +82,7 @@ def test_get_update_theorem_in_repo(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -87,7 +97,7 @@ def test_get_update_theorem_in_repo(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - theorem_statement="Updated statement" + theorem_statement="Updated statement", ) self.repo.update_theorem(updated_theorem) retrieved_theorem = self.repo.get_theorem("test_theorem", "test.lean") @@ -99,17 +109,13 @@ def test_get_update_theorem_in_repo(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) with self.assertRaises(ValueError): self.repo.update_theorem(non_existent_theorem) def test_get_premise_file_in_repo(self): - premise_file = PremiseFile( - path=Path("test.lean"), - imports=[], - premises=[] - ) + premise_file = PremiseFile(path=Path("test.lean"), imports=[], premises=[]) self.repo.premise_files.append(premise_file) self.db.add_repository(self.repo) @@ -141,7 +147,7 @@ def test_difficulty_rating_in_theorem(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - difficulty_rating=0.7 + difficulty_rating=0.7, ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -152,7 +158,7 @@ def test_difficulty_rating_in_theorem(self): def test_validation_in_from_dict(self): with self.assertRaises(ValueError): DynamicDatabase.from_dict({}) - + with self.assertRaises(ValueError): Repository.from_dict({}) @@ -181,7 +187,7 @@ def test_empty_path_to_data_in_from_dict_repository(self): "metadata": {"date_processed": datetime.datetime.now().isoformat()}, "theorems_folder": "", "premise_files_corpus": "", - "files_traced": "" + "files_traced": "", } with self.assertRaises(ValueError): Repository.from_dict(data) @@ -193,17 +199,13 @@ def test_to_dict_for_all(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem_dict = theorem.to_dict() self.assertIsInstance(theorem_dict, dict) self.assertEqual(theorem_dict["full_name"], "test_theorem") - premise_file = PremiseFile( - path=Path("test.lean"), - imports=[], - premises=[] - ) + premise_file = PremiseFile(path=Path("test.lean"), imports=[], premises=[]) premise_file_dict = premise_file.to_dict() self.assertIsInstance(premise_file_dict, dict) self.assertEqual(premise_file_dict["path"], "test.lean") @@ -216,7 +218,7 @@ def test_to_dict_for_all(self): full_name="test_annotation", def_path="test/path.lean", def_pos=Pos(1, 1), - def_end_pos=Pos(2, 1) + def_end_pos=Pos(2, 1), ) annotation_dict = annotation.to_dict() self.assertIsInstance(annotation_dict, dict) @@ -229,7 +231,7 @@ def test_to_dict_for_all(self): tactic="test_tactic", annotated_tactic=("test_tactic", [annotation]), state_before="test_state_before", - state_after="test_state_after" + state_after="test_state_after", ) annotated_tactic_dict = annotated_tactic.to_dict() self.assertIsInstance(annotated_tactic_dict, dict) @@ -242,7 +244,7 @@ def test_to_dict_for_all(self): code="test_code", start=Pos(1, 1), end=Pos(2, 1), - kind="theorem" + kind="theorem", ) premise_dict = premise.to_dict() self.assertIsInstance(premise_dict, dict) @@ -251,7 +253,7 @@ def test_to_dict_for_all(self): self.assertEqual(premise_dict["start"], "(1, 1)") self.assertEqual(premise_dict["end"], "(2, 1)") 
self.assertEqual(premise_dict["kind"], "theorem") - + def test_empty_string_and_none_json_serialization(self): empty_theorem = Theorem( full_name="", @@ -261,25 +263,25 @@ def test_empty_string_and_none_json_serialization(self): url="https://github.com/test/repo", commit="abc123", theorem_statement=None, - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(empty_theorem) self.db.add_repository(self.repo) - + json_file = "empty_none_test.json" self.db.to_json(json_file) - + loaded_db = DynamicDatabase.from_json(json_file) loaded_repo = loaded_db.get_repository(self.repo.url, self.repo.commit) loaded_theorem = loaded_repo.proven_theorems[-1] - + self.assertEqual(loaded_theorem.full_name, "") self.assertEqual(str(loaded_theorem.file_path), ".") self.assertEqual(loaded_theorem.url, "https://github.com/test/repo") self.assertEqual(loaded_theorem.commit, "abc123") self.assertIsNone(loaded_theorem.theorem_statement) self.assertIsNone(loaded_theorem.difficulty_rating) - + def test_complex_json_serialization(self): theorem1 = Theorem( full_name="theorem1", @@ -292,21 +294,24 @@ def test_complex_json_serialization(self): traced_tactics=[ AnnotatedTactic( tactic="rw [add_comm]", - annotated_tactic=("rw [add_comm]", [ - Annotation( - full_name="add_comm", - def_path="src/add_comm.lean", - def_pos=Pos(5, 1), - def_end_pos=Pos(7, 1) - ) - ]), + annotated_tactic=( + "rw [add_comm]", + [ + Annotation( + full_name="add_comm", + def_path="src/add_comm.lean", + def_pos=Pos(5, 1), + def_end_pos=Pos(7, 1), + ) + ], + ), state_before="⊢ 2 + 2 = 4", - state_after="⊢ 2 + 2 = 4" + state_after="⊢ 2 + 2 = 4", ) ], - difficulty_rating=0.7 + difficulty_rating=0.7, ) - + theorem2 = Theorem( full_name="theorem2", file_path=Path("test2.lean"), @@ -316,9 +321,9 @@ def test_complex_json_serialization(self): commit="abc123", theorem_statement="theorem2 : ∀ x y : ℕ, x + y = y + x", traced_tactics=[], - difficulty_rating=None + difficulty_rating=None, ) - + premise_file = PremiseFile( path=Path("premise.lean"), imports=["import data.nat.basic"], @@ -328,11 +333,11 @@ def test_complex_json_serialization(self): code="theorem nat_add_comm : ∀ a b : ℕ, a + b = b + a := sorry", start=Pos(1, 1), end=Pos(1, 60), - kind="theorem" + kind="theorem", ) - ] + ], ) - + complex_repo = Repository( url="https://github.com/test/complex-repo", name="Complex Test Repo", @@ -341,55 +346,59 @@ def test_complex_json_serialization(self): lean_dojo_version="1.0.0", metadata={ "date_processed": datetime.datetime.now(), - "extra_info": {"key1": "value1", "key2": 2} + "extra_info": {"key1": "value1", "key2": 2}, }, proven_theorems=[theorem1], sorry_theorems_unproved=[theorem2], premise_files=[premise_file], files_traced=[Path("test1.lean"), Path("test2.lean")], - pr_url="https://github.com/test/complex-repo/pull/1" + pr_url="https://github.com/test/complex-repo/pull/1", ) - + self.db.add_repository(complex_repo) - + json_file = "complex_test_database.json" self.db.to_json(json_file) - + loaded_db = DynamicDatabase.from_json(json_file) - + self.assertEqual(len(loaded_db.repositories), len(self.db.repositories)) - loaded_repo = loaded_db.get_repository("https://github.com/test/complex-repo", "complex123") + loaded_repo = loaded_db.get_repository( + "https://github.com/test/complex-repo", "complex123" + ) self.assertIsNotNone(loaded_repo) - + self.assertEqual(loaded_repo.name, "Complex Test Repo") self.assertEqual(loaded_repo.lean_version, "4.0.0") - self.assertEqual(loaded_repo.pr_url, 
"https://github.com/test/complex-repo/pull/1") - + self.assertEqual( + loaded_repo.pr_url, "https://github.com/test/complex-repo/pull/1" + ) + # Check theorems self.assertEqual(len(loaded_repo.proven_theorems), 1) self.assertEqual(len(loaded_repo.sorry_theorems_unproved), 1) - + loaded_theorem1 = loaded_repo.proven_theorems[0] self.assertEqual(loaded_theorem1.full_name, "theorem1") self.assertEqual(loaded_theorem1.theorem_statement, "theorem1 : 2 + 2 = 4") self.assertEqual(len(loaded_theorem1.traced_tactics), 1) self.assertEqual(loaded_theorem1.difficulty_rating, 0.7) - + loaded_theorem2 = loaded_repo.sorry_theorems_unproved[0] self.assertEqual(loaded_theorem2.full_name, "theorem2") self.assertIsNone(loaded_theorem2.difficulty_rating) - + # Check premise files self.assertEqual(len(loaded_repo.premise_files), 1) loaded_premise_file = loaded_repo.premise_files[0] self.assertEqual(str(loaded_premise_file.path), "premise.lean") self.assertEqual(len(loaded_premise_file.premises), 1) - + # Check metadata self.assertIn("extra_info", loaded_repo.metadata) self.assertEqual(loaded_repo.metadata["extra_info"]["key1"], "value1") self.assertEqual(loaded_repo.metadata["extra_info"]["key2"], 2) - + # Check files traced self.assertEqual(len(loaded_repo.files_traced), 2) self.assertIn(Path("test1.lean"), loaded_repo.files_traced) @@ -402,7 +411,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem", @@ -410,7 +419,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem3 = Theorem( full_name="other_theorem", @@ -418,7 +427,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.assertTrue(theorem1.is_same_theorem(theorem2)) self.assertFalse(theorem1.is_same_theorem(theorem3)) @@ -430,7 +439,7 @@ def test_repository_properties(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem2", @@ -438,7 +447,7 @@ def test_repository_properties(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem1) self.repo.sorry_theorems_unproved.append(theorem2) @@ -456,7 +465,7 @@ def test_get_all_theorems(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem2", @@ -464,7 +473,7 @@ def test_get_all_theorems(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem1) self.repo.sorry_theorems_unproved.append(theorem2) @@ -481,7 +490,7 @@ def test_empty_repository(self): commit="empty123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(empty_repo) @@ -499,7 +508,7 @@ def test_theorem_with_empty_traced_tactics(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - traced_tactics=[] + traced_tactics=[], ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -516,7 +525,7 @@ def test_none_values(self): url="https://github.com/test/repo", 
commit="abc123", theorem_statement=None, - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(theorem) self.repo.pr_url = None @@ -534,7 +543,7 @@ def test_empty_strings(self): start=Pos(1, 1), end=Pos(2, 1), url="", - commit="" + commit="", ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -551,7 +560,7 @@ def test_empty_strings(self): start=Pos(1, 1), end=Pos(2, 1), url="new_url", - commit="" + commit="", ) self.repo.update_theorem(theorem2) retrieved_theorem = self.repo.get_theorem("", "") # Should be theorem2 @@ -580,7 +589,7 @@ def test_duplicate_url_different_commit(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo2 = Repository( url="https://github.com/test/repo", @@ -588,7 +597,9 @@ def test_duplicate_url_different_commit(self): commit="def456", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now() + datetime.timedelta(days=1)} + metadata={ + "date_processed": datetime.datetime.now() + datetime.timedelta(days=1) + }, ) # Add a theorem to both repositories @@ -599,7 +610,7 @@ def test_duplicate_url_different_commit(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - theorem_statement="Old version" + theorem_statement="Old version", ) repo1.proven_theorems.append(common_theorem) @@ -610,27 +621,31 @@ def test_duplicate_url_different_commit(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="def456", - theorem_statement="New version" + theorem_statement="New version", ) repo2.proven_theorems.append(updated_common_theorem) # Add unique theorems to each repository - repo1.proven_theorems.append(Theorem( - full_name="unique_to_repo1", - file_path=Path("repo1.lean"), - start=Pos(1, 1), - end=Pos(2, 1), - url="https://github.com/test/repo", - commit="abc123" - )) - repo2.proven_theorems.append(Theorem( - full_name="unique_to_repo2", - file_path=Path("repo2.lean"), - start=Pos(1, 1), - end=Pos(2, 1), - url="https://github.com/test/repo", - commit="def456" - )) + repo1.proven_theorems.append( + Theorem( + full_name="unique_to_repo1", + file_path=Path("repo1.lean"), + start=Pos(1, 1), + end=Pos(2, 1), + url="https://github.com/test/repo", + commit="abc123", + ) + ) + repo2.proven_theorems.append( + Theorem( + full_name="unique_to_repo2", + file_path=Path("repo2.lean"), + start=Pos(1, 1), + end=Pos(2, 1), + url="https://github.com/test/repo", + commit="def456", + ) + ) self.db.add_repository(repo1) self.db.add_repository(repo2) @@ -639,14 +654,16 @@ def test_duplicate_url_different_commit(self): dst_dir = Path(RAID_DIR) / DATA_DIR / "test_duplicate_url" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: data = json.load(f) # Check that both repositories are represented self.assertEqual(len(data), 3) # Check that the common theorem is from the most recent repository - common_theorem_in_dataset = next(t for t in data if t["full_name"] == "common_theorem") + common_theorem_in_dataset = next( + t for t in data if t["full_name"] == "common_theorem" + ) self.assertEqual(common_theorem_in_dataset["theorem_statement"], "New version") self.assertEqual(common_theorem_in_dataset["commit"], "def456") @@ -654,12 +671,16 @@ def test_duplicate_url_different_commit(self): self.assertTrue(any(t["full_name"] == 
"unique_to_repo1" for t in data)) self.assertTrue(any(t["full_name"] == "unique_to_repo2" for t in data)) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) - + self.assertEqual(len(metadata["repositories"]), 2) - self.assertTrue(any(repo["commit"] == "abc123" for repo in metadata["repositories"])) - self.assertTrue(any(repo["commit"] == "def456" for repo in metadata["repositories"])) + self.assertTrue( + any(repo["commit"] == "abc123" for repo in metadata["repositories"]) + ) + self.assertTrue( + any(repo["commit"] == "def456" for repo in metadata["repositories"]) + ) def test_change_sorry_to_proven(self): theorem = Theorem( @@ -668,7 +689,7 @@ def test_change_sorry_to_proven(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.sorry_theorems_unproved.append(theorem) self.db.add_repository(self.repo) @@ -684,14 +705,14 @@ def test_change_sorry_to_proven(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) with self.assertRaises(ValueError): self.repo.change_sorry_to_proven(not_found_theorem, PROOF_LOG_FILE_NAME) with self.assertRaises(ValueError): self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) - + def test_add_repository_duplicate(self): repo = Repository( url="https://github.com/test/repo", @@ -708,7 +729,9 @@ def test_add_repository_duplicate(self): # Try to add the same repository again self.db.add_repository(repo) - self.assertEqual(len(self.db.repositories), 1, "Repository should not be added twice") + self.assertEqual( + len(self.db.repositories), 1, "Repository should not be added twice" + ) # Verify that the repository details are unchanged added_repo = self.db.get_repository("https://github.com/test/repo", "abc123") @@ -723,7 +746,7 @@ def test_repository_equality(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo2 = Repository( url="https://github.com/test/repo", @@ -731,7 +754,7 @@ def test_repository_equality(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo3 = Repository( url="https://github.com/test/repo", @@ -739,7 +762,7 @@ def test_repository_equality(self): commit="def456", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.assertEqual(repo1, repo2) self.assertNotEqual(repo1, repo3) @@ -751,13 +774,13 @@ def test_add_repository_duplicate(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(repo) self.assertEqual(len(self.db.repositories), 1) self.db.add_repository(repo) self.assertEqual(len(self.db.repositories), 1) - + def test_update_repository_duplicate(self): repo = Repository( url="https://github.com/test/repo", @@ -765,7 +788,7 @@ def test_update_repository_duplicate(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(repo) 
self.assertEqual(len(self.db.repositories), 1) @@ -789,7 +812,7 @@ def test_update_repository_duplicate(self): self.assertEqual(updated_repo.name, "Updated Repo") self.assertEqual(updated_repo.commit, "abc123") self.assertEqual(added_repo.lean_version, "3.50.4") - + def test_update_theorem_difficulty(self): theorem = Theorem( full_name="test_theorem", @@ -798,7 +821,7 @@ def test_update_theorem_difficulty(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -826,7 +849,7 @@ def test_update_theorem_difficulty(self): loaded_db = DynamicDatabase.from_json(json_file) loaded_repo = loaded_db.get_repository("https://github.com/test/repo", "abc123") self.assertIsNotNone(loaded_repo) - + loaded_theorem = loaded_repo.get_theorem("test_theorem", "test.lean") self.assertIsNotNone(loaded_theorem) @@ -838,22 +861,30 @@ def create_theorem(self, name, tactics): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - traced_tactics=tactics + traced_tactics=tactics, ) - + def _calculate_difficulty(self, theorem: Theorem) -> Union[float, None]: proof_steps = theorem.traced_tactics - if any('sorry' in step.tactic for step in proof_steps): - return float('inf') # Hard (no proof) + if any("sorry" in step.tactic for step in proof_steps): + return float("inf") # Hard (no proof) if len(proof_steps) == 0: return None # To be distributed later return math.exp(len(proof_steps)) def test_calculate_and_update_difficulty(self): # Test case 1: Theorem with 'sorry' - sorry_theorem = self.create_theorem("sorry_theorem", [ - AnnotatedTactic(tactic="sorry", annotated_tactic=("sorry", []), state_before="", state_after="") - ]) + sorry_theorem = self.create_theorem( + "sorry_theorem", + [ + AnnotatedTactic( + tactic="sorry", + annotated_tactic=("sorry", []), + state_before="", + state_after="", + ) + ], + ) self.repo.sorry_theorems_unproved.append(sorry_theorem) # Test case 2: Theorem with no tactics @@ -861,17 +892,43 @@ def test_calculate_and_update_difficulty(self): self.repo.proven_theorems.append(empty_theorem) # Test case 3: Theorem with proven sorry - normal_theorem = self.create_theorem("proven_sorry_theorem", [ - AnnotatedTactic(tactic="tactic1", annotated_tactic=("tactic1", []), state_before="", state_after=""), - AnnotatedTactic(tactic="tactic2", annotated_tactic=("tactic2", []), state_before="", state_after="") - ]) + normal_theorem = self.create_theorem( + "proven_sorry_theorem", + [ + AnnotatedTactic( + tactic="tactic1", + annotated_tactic=("tactic1", []), + state_before="", + state_after="", + ), + AnnotatedTactic( + tactic="tactic2", + annotated_tactic=("tactic2", []), + state_before="", + state_after="", + ), + ], + ) self.repo.proven_theorems.append(normal_theorem) # Test case 4: Theorem with normal teactics - normal_theorem = self.create_theorem("normal_theorem", [ - AnnotatedTactic(tactic="tactic1", annotated_tactic=("tactic1", []), state_before="before", state_after="no goals"), - AnnotatedTactic(tactic="tactic2", annotated_tactic=("tactic2", []), state_before="before2", state_after="no goals") - ]) + normal_theorem = self.create_theorem( + "normal_theorem", + [ + AnnotatedTactic( + tactic="tactic1", + annotated_tactic=("tactic1", []), + state_before="before", + state_after="no goals", + ), + AnnotatedTactic( + tactic="tactic2", + annotated_tactic=("tactic2", []), + state_before="before2", + state_after="no goals", + ), + ], + ) 
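As a quick cross-check of the difficulty rule mirrored in _calculate_difficulty above, the four theorems constructed in this test should come out as follows; this is only an evaluation of that rule (math.exp is from the standard library), not additional test logic.

import math

# Expected ratings under the rule in _calculate_difficulty:
#   sorry_theorem        -> float("inf")  (a tactic contains 'sorry')
#   empty_theorem        -> None          (no tactics; to be distributed later)
#   proven_sorry_theorem -> math.exp(2)   (two traced tactics)
#   normal_theorem       -> math.exp(2)   (two traced tactics)
print(math.exp(2))  # ~7.389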
self.repo.proven_theorems.append(normal_theorem) self.db.add_repository(self.repo) @@ -885,7 +942,7 @@ def test_calculate_and_update_difficulty(self): self.db.update_repository(self.repo) sorry_theorem = self.repo.get_theorem("sorry_theorem", "test.lean") - self.assertEqual(sorry_theorem.difficulty_rating, float('inf')) + self.assertEqual(sorry_theorem.difficulty_rating, float("inf")) empty_theorem = self.repo.get_theorem("empty_theorem", "test.lean") self.assertIsNone(empty_theorem.difficulty_rating) @@ -901,7 +958,7 @@ def test_calculate_and_update_difficulty(self): self.assertIsNotNone(loaded_repo) loaded_sorry_theorem = loaded_repo.get_theorem("sorry_theorem", "test.lean") - self.assertEqual(loaded_sorry_theorem.difficulty_rating, float('inf')) + self.assertEqual(loaded_sorry_theorem.difficulty_rating, float("inf")) loaded_empty_theorem = loaded_repo.get_theorem("empty_theorem", "test.lean") self.assertIsNone(loaded_empty_theorem.difficulty_rating) @@ -909,6 +966,7 @@ def test_calculate_and_update_difficulty(self): loaded_normal_theorem = loaded_repo.get_theorem("normal_theorem", "test.lean") self.assertEqual(loaded_normal_theorem.difficulty_rating, math.exp(2)) + class TestDynamicDatabaseSimpleLean(unittest.TestCase): def setUp(self): self.db = DynamicDatabase() @@ -919,7 +977,7 @@ def create_simple_lean_repo(self): url = "https://github.com/Adarsh321123/SimpleLean" commit = "99a5078e1614e61f0d9cc234ca246c8744a4e660" lean_git_repo = LeanGitRepo(url, commit) - dir_name = url.split("/")[-1].replace('.git', '') + "_" + commit + dir_name = url.split("/")[-1].replace(".git", "") + "_" + commit dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + "_updated" config = lean_git_repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) @@ -951,14 +1009,15 @@ def test_generate_dataset_with_empty_repo(self): self.assertTrue(dst_dir.exists()) self.assertTrue((dst_dir / "random").exists()) self.assertTrue((dst_dir / "novel_premises").exists()) - - for split in ['train', 'val', 'test']: - with open(dst_dir / "random" / f"{split}.json", 'r') as f: + + for split in ["train", "val", "test"]: + with open(dst_dir / "random" / f"{split}.json", "r") as f: data = json.load(f) self.assertEqual(len(data), 0) + class TestDynamicDatabaseUnicode(unittest.TestCase): - """" + """ " Unit test class for testing Unicode handling in the DynamicDatabase class. This test suite focuses on verifying that the DynamicDatabase correctly handles Unicode characters during serialization and deserialization operations, and @@ -973,6 +1032,7 @@ class TestDynamicDatabaseUnicode(unittest.TestCase): This ensures the database can correctly handle international character sets and mathematical notation when saving to and loading from JSON files. """ + def setUp(self): self.db = DynamicDatabase() self.unicode_repo = self.create_unicode_sample_repo() @@ -985,7 +1045,7 @@ def assertDatetimeEqual(self, dt1, dt2): may lose microsecond precision. 
""" self.assertEqual(dt1.replace(microsecond=0), dt2.replace(microsecond=0)) - + def create_unicode_sample_repo(self): repo = Repository( url="https://github.com/example/repo", @@ -1007,19 +1067,22 @@ def create_unicode_sample_repo(self): traced_tactics=[ AnnotatedTactic( tactic="induction x with n ih", - annotated_tactic=("induction x with n ih", [ - Annotation( - full_name="induction", - def_path="src/tactic/induction.lean", - def_pos=Pos(100, 1), - def_end_pos=Pos(100, 10) - ) - ]), + annotated_tactic=( + "induction x with n ih", + [ + Annotation( + full_name="induction", + def_path="src/tactic/induction.lean", + def_pos=Pos(100, 1), + def_end_pos=Pos(100, 10), + ) + ], + ), state_before="⊢ ∀ x y : ℕ, x + y = y + x", - state_after="2 goals\ncase zero\n⊢ ∀ y : ℕ, 0 + y = y + 0\ncase succ\nn : ℕ\nih : ∀ y : ℕ, n + y = y + n\n⊢ ∀ y : ℕ, succ n + y = y + succ n" + state_after="2 goals\ncase zero\n⊢ ∀ y : ℕ, 0 + y = y + 0\ncase succ\nn : ℕ\nih : ∀ y : ℕ, n + y = y + n\n⊢ ∀ y : ℕ, succ n + y = y + succ n", ) ], - difficulty_rating=0.7 + difficulty_rating=0.7, ) theorem2 = Theorem( @@ -1031,7 +1094,7 @@ def create_unicode_sample_repo(self): url="https://github.com/example/repo", commit="abc123", traced_tactics=[], - difficulty_rating=0.9 + difficulty_rating=0.9, ) repo.proven_theorems.append(theorem1) @@ -1046,87 +1109,107 @@ def create_unicode_sample_repo(self): code="theorem sqrt_squared (x : ℝ) (h : x ≥ 0) : √(x^2) = x := sorry", start=Pos(1, 1), end=Pos(1, 70), - kind="theorem" + kind="theorem", ) - ] + ], ) repo.premise_files.append(premise_file) repo.files_traced.append(Path("src/example.lean")) return repo - + def test_unicode_serialization_deserialization(self): json_file = "test_unicode_database.json" self.db.to_json(json_file) - + deserialized_db = DynamicDatabase.from_json(json_file) - + assert len(self.db.repositories) == len(deserialized_db.repositories) - + original_repo = self.db.repositories[0] deserialized_repo = deserialized_db.repositories[0] - + assert original_repo.name == deserialized_repo.name - self.assertDatetimeEqual(original_repo.metadata["date_processed"], deserialized_repo.metadata["date_processed"]) - + self.assertDatetimeEqual( + original_repo.metadata["date_processed"], + deserialized_repo.metadata["date_processed"], + ) + original_theorem1 = original_repo.proven_theorems[0] deserialized_theorem1 = deserialized_repo.proven_theorems[0] - - assert original_theorem1.theorem_statement == deserialized_theorem1.theorem_statement - assert original_theorem1.traced_tactics[0].state_before == deserialized_theorem1.traced_tactics[0].state_before - assert original_theorem1.traced_tactics[0].state_after == deserialized_theorem1.traced_tactics[0].state_after - + + assert ( + original_theorem1.theorem_statement + == deserialized_theorem1.theorem_statement + ) + assert ( + original_theorem1.traced_tactics[0].state_before + == deserialized_theorem1.traced_tactics[0].state_before + ) + assert ( + original_theorem1.traced_tactics[0].state_after + == deserialized_theorem1.traced_tactics[0].state_after + ) + original_theorem2 = original_repo.sorry_theorems_unproved[0] deserialized_theorem2 = deserialized_repo.sorry_theorems_unproved[0] - - assert original_theorem2.theorem_statement == deserialized_theorem2.theorem_statement - + + assert ( + original_theorem2.theorem_statement + == deserialized_theorem2.theorem_statement + ) + original_premise = original_repo.premise_files[0].premises[0] deserialized_premise = deserialized_repo.premise_files[0].premises[0] - + assert 
original_premise.code == deserialized_premise.code - + def test_unicode_modification(self): json_file = "test_unicode_database.json" self.db.to_json(json_file) - + deserialized_db = DynamicDatabase.from_json(json_file) - - repo = deserialized_db.get_repository("https://github.com/example/repo", "abc123") + + repo = deserialized_db.get_repository( + "https://github.com/example/repo", "abc123" + ) assert repo is not None - + sorry_theorem = repo.sorry_theorems_unproved[0] - + sorry_theorem.traced_tactics = [ AnnotatedTactic( tactic="intros a b c x h_a_nonzero", annotated_tactic=("intros a b c x h_a_nonzero", []), state_before="⊢ ∀ a b c x : ℝ, a ≠ 0 → (a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a))", - state_after="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)" + state_after="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)", ), AnnotatedTactic( tactic="apply iff.intro", annotated_tactic=("apply iff.intro", []), state_before="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)", - state_after="no goals" - ) + state_after="no goals", + ), ] - + repo.change_sorry_to_proven(sorry_theorem, PROOF_LOG_FILE_NAME) deserialized_db.update_json(json_file) updated_db = DynamicDatabase.from_json(json_file) - updated_repo = updated_db.get_repository("https://github.com/example/repo", "abc123") + updated_repo = updated_db.get_repository( + "https://github.com/example/repo", "abc123" + ) assert updated_repo is not None - + assert len(updated_repo.sorry_theorems_unproved) == 0 assert len(updated_repo.sorry_theorems_proved) == 1 - + updated_theorem = updated_repo.sorry_theorems_proved[0] assert updated_theorem.full_name == "example.quadratic_formula" assert len(updated_theorem.traced_tactics) == 2 assert "√(b² - 4*a*c)" in updated_theorem.traced_tactics[0].state_before assert "↔" in updated_theorem.traced_tactics[1].state_before + class TestDynamicDatabase(unittest.TestCase): """ Test suite for the DynamicDatabase class. @@ -1146,6 +1229,7 @@ class TestDynamicDatabase(unittest.TestCase): Each test method uses a fresh DynamicDatabase instance and a test Repository object created in the setUp method. 
""" + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -1172,7 +1256,9 @@ def test_add_repository(self): def test_get_repository(self): self.db.add_repository(self.repo) - retrieved_repo = self.db.get_repository("https://github.com/test/repo", "abc123") + retrieved_repo = self.db.get_repository( + "https://github.com/test/repo", "abc123" + ) self.assertEqual(retrieved_repo, self.repo) def test_update_repository(self): @@ -1183,12 +1269,19 @@ def test_update_repository(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now() + datetime.timedelta(days=1)}, + metadata={ + "date_processed": datetime.datetime.now() + datetime.timedelta(days=1) + }, ) self.db.update_repository(updated_repo) - retrieved_repo = self.db.get_repository("https://github.com/test/repo", "abc123") + retrieved_repo = self.db.get_repository( + "https://github.com/test/repo", "abc123" + ) self.assertEqual(retrieved_repo.name, "Test Repo") - self.assertNotEqual(retrieved_repo.metadata["date_processed"].replace(microsecond=0), self.current_datetime.replace(microsecond=0)) + self.assertNotEqual( + retrieved_repo.metadata["date_processed"].replace(microsecond=0), + self.current_datetime.replace(microsecond=0), + ) def test_delete_repository(self): self.db.add_repository(self.repo) @@ -1203,7 +1296,10 @@ def test_to_json_and_from_json(self): self.assertEqual(len(loaded_db.repositories), 1) loaded_repo = loaded_db.get_repository("https://github.com/test/repo", "abc123") self.assertEqual(loaded_repo.name, "Test Repo") - self.assertDatetimeEqual(loaded_repo.metadata["date_processed"], self.current_datetime) + self.assertDatetimeEqual( + loaded_repo.metadata["date_processed"], self.current_datetime + ) + class TestDynamicDatabasePFR(unittest.TestCase): def setUp(self): @@ -1237,7 +1333,7 @@ def create_sample_repo(self): "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, "files_traced": files_traced, - "pr_url": pr_url + "pr_url": pr_url, } repo = Repository.from_dict(data) return repo @@ -1245,30 +1341,54 @@ def create_sample_repo(self): def test_repository_creation(self): self.assertIsNotNone(self.sample_repo) self.assertEqual(self.sample_repo.url, "https://github.com/teorth/pfr") - self.assertEqual(self.sample_repo.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e") + self.assertEqual( + self.sample_repo.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) def test_theorem_loading(self): self.assertGreater(len(self.sample_repo.proven_theorems), 0) self.assertGreater(len(self.sample_repo.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean")) + self.assertEqual( + theorem.file_path, + Path( + ".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean" + ), + ) self.assertEqual(theorem.start, Pos(281, 1)) self.assertEqual(theorem.end, Pos(290, 26)) def test_traced_tactics(self): - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) 
self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _") - self.assertIn("ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0]) + self.assertEqual( + first_tactic.tactic, + "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _", + ) + self.assertIn( + "ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0] + ) def test_premise_loading(self): self.assertGreater(len(self.sample_repo.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1289,8 +1409,12 @@ def test_serialization_deserialization(self): self.assertEqual(original_repo.name, deserialized_repo.name) self.assertEqual(original_repo.commit, deserialized_repo.commit) - self.assertEqual(len(original_repo.proven_theorems), len(deserialized_repo.proven_theorems)) - self.assertEqual(len(original_repo.premise_files), len(deserialized_repo.premise_files)) + self.assertEqual( + len(original_repo.proven_theorems), len(deserialized_repo.proven_theorems) + ) + self.assertEqual( + len(original_repo.premise_files), len(deserialized_repo.premise_files) + ) def test_generate_dataset_structure(self): url = "https://github.com/teorth/pfr" @@ -1319,7 +1443,7 @@ def test_generated_dataset_content(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: train_data = json.load(f) self.assertIsInstance(train_data, list) self.assertGreater(len(train_data), 0) @@ -1333,19 +1457,19 @@ def test_generated_dataset_content(self): self.assertIn("end", first_theorem) self.assertIn("traced_tactics", first_theorem) - with open(dst_dir / "corpus.jsonl", 'r') as f: + with open(dst_dir / "corpus.jsonl", "r") as f: first_line = f.readline().strip() first_premise_file = json.loads(first_line) self.assertIn("path", first_premise_file) self.assertIn("imports", first_premise_file) self.assertIn("premises", first_premise_file) - with open(dst_dir / "traced_files.jsonl", 'r') as f: + with open(dst_dir / "traced_files.jsonl", "r") as f: first_line = f.readline().strip() first_traced_file = json.loads(first_line) self.assertIn("traced_file_path", first_traced_file) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) self.assertIn("repositories", metadata) self.assertEqual(len(metadata["repositories"]), 1) @@ -1371,22 +1495,46 @@ def test_dataset_splitting(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - for strategy in ['random', 'novel_premises']: + for strategy in ["random", "novel_premises"]: train_set = set() val_set = set() test_set = set() - with open(dst_dir / strategy / "train.json", 'r') as f: + with open(dst_dir / strategy / "train.json", "r") as f: train_data = json.load(f) - train_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in train_data) + 
train_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in train_data + ) - with open(dst_dir / strategy / "val.json", 'r') as f: + with open(dst_dir / strategy / "val.json", "r") as f: val_data = json.load(f) - val_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in val_data) + val_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in val_data + ) - with open(dst_dir / strategy / "test.json", 'r') as f: + with open(dst_dir / strategy / "test.json", "r") as f: test_data = json.load(f) - test_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in test_data) + test_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in test_data + ) self.assertGreater(len(train_set), 0) self.assertGreater(len(val_set), 0) @@ -1406,58 +1554,78 @@ def test_dataset_consistency(self): # Check that all theorems in the dataset are from the original repository all_theorems = set(thm.full_name for thm in self.sample_repo.get_all_theorems) - for strategy in ['random', 'novel_premises']: - for split in ['train', 'val', 'test']: - with open(dst_dir / strategy / f"{split}.json", 'r') as f: + for strategy in ["random", "novel_premises"]: + for split in ["train", "val", "test"]: + with open(dst_dir / strategy / f"{split}.json", "r") as f: data = json.load(f) for item in data: - self.assertIn(item['full_name'], all_theorems) + self.assertIn(item["full_name"], all_theorems) def test_compare_manual_and_dynamic_datasets(self): random.seed(3407) - manual_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" - dynamic_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_generated" + manual_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" + ) + dynamic_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_generated" + ) self.db.generate_merged_dataset(dynamic_dataset_path) - - for strategy in ['random', 'novel_premises']: + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual {split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - assert len(manual_theorems) == len(dynamic_theorems), "Manual and dynamic datasets have different number of theorems" - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") - 
self.assertTrue(self._fast_compare_theorems(manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + assert len(manual_theorems) == len( + dynamic_theorems + ), "Manual and dynamic datasets have different number of theorems" + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) + self.assertTrue( + self._fast_compare_theorems(manual_theorems, dynamic_theorems), + f"Theorem content for {strategy} strategy does not match", + ) logger.info(f"Theorem content for {strategy} strategy matches") self.maxDiff = None logger.info("Comparing corpus and traced files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - assert len(manual_corpus) == len(dynamic_corpus), "Manual and dynamic datasets have different number of premise files" + assert len(manual_corpus) == len( + dynamic_corpus + ), "Manual and dynamic datasets have different number of premise files" logger.info("Comparing corpus content") try: self.assertCountEqual(manual_corpus, dynamic_corpus) @@ -1467,15 +1635,17 @@ def test_compare_manual_and_dynamic_datasets(self): logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - assert len(manual_traced) == len(dynamic_traced), "Manual and dynamic datasets have different number of traced files" + assert len(manual_traced) == len( + dynamic_traced + ), "Manual and dynamic datasets have different number of traced files" logger.info("Comparing traced files content") try: self.assertCountEqual(manual_traced, dynamic_traced) @@ -1486,12 +1656,20 @@ def test_compare_manual_and_dynamic_datasets(self): raise def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): - logger.info(f"Converting {len(manual_theorems)} manual theorems to hashable format") + logger.info( + f"Converting {len(manual_theorems)} manual theorems to hashable format" + ) manual_set = set(map(self._theorem_to_hashable, manual_theorems)) - assert len(manual_set) == len(manual_theorems), "Manual theorems contain duplicates" - logger.info(f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format") + assert len(manual_set) == len( + manual_theorems + ), "Manual theorems contain duplicates" + logger.info( + f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format" + ) dynamic_set = set(map(self._theorem_to_hashable, dynamic_theorems)) - assert len(dynamic_set) == len(dynamic_theorems), "Dynamic theorems contain duplicates" + assert len(dynamic_set) == len( + 
dynamic_theorems + ), "Dynamic theorems contain duplicates" logger.info("Comparing theorem sets") only_in_manual = manual_set - dynamic_set @@ -1520,20 +1698,27 @@ def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): def _theorem_to_hashable(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']), + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _tactic_to_hashable(self, tactic): return ( - tactic['tactic'], - tactic['annotated_tactic'][0], - tuple((a['full_name'], a['def_path'], tuple(a['def_pos']), tuple(a['def_end_pos'])) - for a in tactic['annotated_tactic'][1]), - tactic['state_before'], - tactic['state_after'] + tactic["tactic"], + tactic["annotated_tactic"][0], + tuple( + ( + a["full_name"], + a["def_path"], + tuple(a["def_pos"]), + tuple(a["def_end_pos"]), + ) + for a in tactic["annotated_tactic"][1] + ), + tactic["state_before"], + tactic["state_after"], ) def test_unicode_handling_in_dataset(self): @@ -1543,16 +1728,26 @@ def test_unicode_handling_in_dataset(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "metadata.json", 'r', encoding='utf-8') as f: + with open(dst_dir / "metadata.json", "r", encoding="utf-8") as f: metadata = json.load(f) - self.assertIn('repositories', metadata, "No 'repositories' key in metadata") - self.assertGreater(len(metadata['repositories']), 0, "No repositories in metadata") - repo = metadata['repositories'][0] - self.assertIn('metadata', repo, "No 'metadata' key in repository") - repo_metadata = repo['metadata'] - self.assertIn('unicode', repo_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][0]['metadata']['unicode']) + self.assertIn("repositories", metadata, "No 'repositories' key in metadata") + self.assertGreater( + len(metadata["repositories"]), 0, "No repositories in metadata" + ) + repo = metadata["repositories"][0] + self.assertIn("metadata", repo, "No 'metadata' key in repository") + repo_metadata = repo["metadata"] + self.assertIn( + "unicode", repo_metadata, "No 'unicode' key in repository metadata" + ) + self.assertIn( + "ユニコード", + repo_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][0]["metadata"]["unicode"] + ) def tearDown(self): # Clean up generated files after tests @@ -1564,7 +1759,11 @@ def tearDown(self): shutil.rmtree(dst_dir) def test_theorem_statement(self): - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("opNorm_lsmul", theorem.theorem_statement) @@ -1575,8 +1774,14 @@ def test_unicode_handling(self): def test_file_tracing(self): self.assertGreater(len(self.sample_repo.files_traced), 0) - self.assertIn(Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo.files_traced) + self.assertIn( + Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo.files_traced + ) + self.assertIn( + 
Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo.files_traced, + ) + class TestDynamicDatabasePFRNewVersion(unittest.TestCase): """ @@ -1598,10 +1803,16 @@ class TestDynamicDatabasePFRNewVersion(unittest.TestCase): It also compares dynamically generated datasets with manually created ones to ensure compatibility and correctness. """ + def setUp(self): self.db = DynamicDatabase() - self.sample_repo_PFR = self.create_sample_repo("https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e") - self.sample_repo_new_version = self.create_sample_repo("https://github.com/Adarsh321123/new-version-test", "f465306be03ced999caa157a85558a6c41b3e3f5") + self.sample_repo_PFR = self.create_sample_repo( + "https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) + self.sample_repo_new_version = self.create_sample_repo( + "https://github.com/Adarsh321123/new-version-test", + "f465306be03ced999caa157a85558a6c41b3e3f5", + ) self.db.add_repository(self.sample_repo_PFR) self.db.add_repository(self.sample_repo_new_version) @@ -1628,14 +1839,14 @@ def create_sample_repo(self, url, commit): } repo = Repository.from_dict(data) return repo - + def test_multiple_json_serialization_deserialization(self): json_file1 = "test_multiple_1.json" json_file2 = "test_multiple_2.json" - + # First serialization self.db.to_json(json_file1) - + # Deserialize and modify loaded_db1 = DynamicDatabase.from_json(json_file1) new_repo = Repository( @@ -1644,31 +1855,45 @@ def test_multiple_json_serialization_deserialization(self): commit="newcommit123", lean_version="4.0.0", lean_dojo_version="1.0.0", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) loaded_db1.add_repository(new_repo) - + # Second serialization loaded_db1.to_json(json_file2) - + # Final deserialization loaded_db2 = DynamicDatabase.from_json(json_file2) - - self.assertEqual(len(loaded_db2.repositories), 3) # PFR, new-version-test, and new-repo - self.assertEqual(loaded_db2.repositories[2].url, "https://github.com/test/new-repo") + + self.assertEqual( + len(loaded_db2.repositories), 3 + ) # PFR, new-version-test, and new-repo + self.assertEqual( + loaded_db2.repositories[2].url, "https://github.com/test/new-repo" + ) self.assertEqual(loaded_db2.repositories[2].commit, "newcommit123") - + # Check that the original repositories are still intact - self.assertEqual(loaded_db2.repositories[0].url, "https://github.com/teorth/pfr") - self.assertEqual(loaded_db2.repositories[1].url, "https://github.com/Adarsh321123/new-version-test") - + self.assertEqual( + loaded_db2.repositories[0].url, "https://github.com/teorth/pfr" + ) + self.assertEqual( + loaded_db2.repositories[1].url, + "https://github.com/Adarsh321123/new-version-test", + ) + # Verify that the content of the repositories is preserved - pfr_repo = loaded_db2.get_repository("https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e") + pfr_repo = loaded_db2.get_repository( + "https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) self.assertIsNotNone(pfr_repo) self.assertGreater(len(pfr_repo.proven_theorems), 0) self.assertGreater(len(pfr_repo.sorry_theorems_unproved), 0) - - new_version_repo = loaded_db2.get_repository("https://github.com/Adarsh321123/new-version-test", "f465306be03ced999caa157a85558a6c41b3e3f5") + + new_version_repo = loaded_db2.get_repository( + "https://github.com/Adarsh321123/new-version-test", + 
"f465306be03ced999caa157a85558a6c41b3e3f5", + ) self.assertIsNotNone(new_version_repo) self.assertGreater(len(new_version_repo.proven_theorems), 0) self.assertGreater(len(new_version_repo.sorry_theorems_unproved), 0) @@ -1676,49 +1901,92 @@ def test_multiple_json_serialization_deserialization(self): def test_repository_creation(self): self.assertIsNotNone(self.sample_repo_PFR) self.assertEqual(self.sample_repo_PFR.url, "https://github.com/teorth/pfr") - self.assertEqual(self.sample_repo_PFR.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e") + self.assertEqual( + self.sample_repo_PFR.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) self.assertIsNotNone(self.sample_repo_new_version) - self.assertEqual(self.sample_repo_new_version.url, "https://github.com/Adarsh321123/new-version-test") - self.assertEqual(self.sample_repo_new_version.commit, "f465306be03ced999caa157a85558a6c41b3e3f5") + self.assertEqual( + self.sample_repo_new_version.url, + "https://github.com/Adarsh321123/new-version-test", + ) + self.assertEqual( + self.sample_repo_new_version.commit, + "f465306be03ced999caa157a85558a6c41b3e3f5", + ) def test_theorem_loading(self): self.assertGreater(len(self.sample_repo_PFR.proven_theorems), 0) self.assertGreater(len(self.sample_repo_PFR.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean")) + self.assertEqual( + theorem.file_path, + Path( + ".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean" + ), + ) self.assertEqual(theorem.start, Pos(281, 1)) self.assertEqual(theorem.end, Pos(290, 26)) self.assertGreater(len(self.sample_repo_new_version.proven_theorems), 0) self.assertGreater(len(self.sample_repo_new_version.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/SetTheory/Ordinal/Arithmetic.lean")) + self.assertEqual( + theorem.file_path, + Path(".lake/packages/mathlib/Mathlib/SetTheory/Ordinal/Arithmetic.lean"), + ) self.assertEqual(theorem.start, Pos(742, 1)) self.assertEqual(theorem.end, Pos(744, 17)) def test_traced_tactics(self): - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _") - self.assertIn("ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0]) + self.assertEqual( + first_tactic.tactic, + "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _", + ) + self.assertIn( + "ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0] + ) - theorem = next(t for t in 
self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "convert mul_le_mul_right' (one_le_iff_pos.2 hb) a") + self.assertEqual( + first_tactic.tactic, "convert mul_le_mul_right' (one_le_iff_pos.2 hb) a" + ) self.assertIn("mul_le_mul_right'", first_tactic.annotated_tactic[0]) def test_premise_loading(self): self.assertGreater(len(self.sample_repo_PFR.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo_PFR.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo_PFR.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1728,7 +1996,11 @@ def test_premise_loading(self): self.assertGreater(len(self.sample_repo_new_version.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo_new_version.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo_new_version.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1749,16 +2021,32 @@ def test_serialization_deserialization(self): self.assertEqual(original_repo_PFR.name, deserialized_repo_PFR.name) self.assertEqual(original_repo_PFR.commit, deserialized_repo_PFR.commit) - self.assertEqual(len(original_repo_PFR.proven_theorems), len(deserialized_repo_PFR.proven_theorems)) - self.assertEqual(len(original_repo_PFR.premise_files), len(deserialized_repo_PFR.premise_files)) + self.assertEqual( + len(original_repo_PFR.proven_theorems), + len(deserialized_repo_PFR.proven_theorems), + ) + self.assertEqual( + len(original_repo_PFR.premise_files), + len(deserialized_repo_PFR.premise_files), + ) original_repo_new_version = self.db.repositories[0] deserialized_repo_new_version = deserialized_db.repositories[0] - self.assertEqual(original_repo_new_version.name, deserialized_repo_new_version.name) - self.assertEqual(original_repo_new_version.commit, deserialized_repo_new_version.commit) - self.assertEqual(len(original_repo_new_version.proven_theorems), len(deserialized_repo_new_version.proven_theorems)) - self.assertEqual(len(original_repo_new_version.premise_files), len(deserialized_repo_new_version.premise_files)) + self.assertEqual( + original_repo_new_version.name, deserialized_repo_new_version.name + ) + self.assertEqual( + original_repo_new_version.commit, deserialized_repo_new_version.commit + ) + self.assertEqual( + len(original_repo_new_version.proven_theorems), + len(deserialized_repo_new_version.proven_theorems), + ) + self.assertEqual( + len(original_repo_new_version.premise_files), + len(deserialized_repo_new_version.premise_files), + ) def test_generate_dataset_structure(self): url_PFR = "https://github.com/teorth/pfr" @@ -1767,7 +2055,11 @@ def test_generate_dataset_structure(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / 
DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) self.assertTrue(dst_dir.exists()) @@ -1790,30 +2082,42 @@ def test_generated_dataset_content(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) # Count sorry theorems in each repository repo_sorry_counts = { self.sample_repo_PFR.url: len(self.sample_repo_PFR.sorry_theorems_unproved), - self.sample_repo_new_version.url: len(self.sample_repo_new_version.sorry_theorems_unproved) + self.sample_repo_new_version.url: len( + self.sample_repo_new_version.sorry_theorems_unproved + ), } total_sorry_theorems = sum(repo_sorry_counts.values()) # Count sorry theorems in the generated dataset dataset_sorry_count = 0 - for split in ['train', 'val', 'test']: - with open(dst_dir / "random" / f"{split}.json", 'r') as f: + for split in ["train", "val", "test"]: + with open(dst_dir / "random" / f"{split}.json", "r") as f: data = json.load(f) for theorem in data: - if any(tactic.get('tactic') == 'sorry' for tactic in theorem.get('traced_tactics', [])): + if any( + tactic.get("tactic") == "sorry" + for tactic in theorem.get("traced_tactics", []) + ): dataset_sorry_count += 1 - self.assertEqual(dataset_sorry_count, total_sorry_theorems, - f"Number of sorry theorems in dataset ({dataset_sorry_count}) does not match " - f"the sum from individual repositories ({total_sorry_theorems})") + self.assertEqual( + dataset_sorry_count, + total_sorry_theorems, + f"Number of sorry theorems in dataset ({dataset_sorry_count}) does not match " + f"the sum from individual repositories ({total_sorry_theorems})", + ) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: train_data = json.load(f) self.assertIsInstance(train_data, list) self.assertGreater(len(train_data), 0) @@ -1827,19 +2131,19 @@ def test_generated_dataset_content(self): self.assertIn("end", first_theorem) self.assertIn("traced_tactics", first_theorem) - with open(dst_dir / "corpus.jsonl", 'r') as f: + with open(dst_dir / "corpus.jsonl", "r") as f: first_line = f.readline().strip() first_premise_file = json.loads(first_line) self.assertIn("path", first_premise_file) self.assertIn("imports", first_premise_file) self.assertIn("premises", first_premise_file) - with open(dst_dir / "traced_files.jsonl", 'r') as f: + with open(dst_dir / "traced_files.jsonl", "r") as f: first_line = f.readline().strip() first_traced_file = json.loads(first_line) self.assertIn("traced_file_path", first_traced_file) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) self.assertIn("repositories", metadata) self.assertEqual(len(metadata["repositories"]), 2) @@ -1851,9 +2155,12 @@ def test_generated_dataset_content(self): self.assertIn("num_files_traced", metadata) # Check if the total number of sorry theorems in metadata matches our count - self.assertEqual(metadata["num_sorry_theorems"], total_sorry_theorems, - f"Number of sorry theorems in metadata 
({metadata['num_sorry_theorems']}) " - f"does not match the sum from individual repositories ({total_sorry_theorems})") + self.assertEqual( + metadata["num_sorry_theorems"], + total_sorry_theorems, + f"Number of sorry theorems in metadata ({metadata['num_sorry_theorems']}) " + f"does not match the sum from individual repositories ({total_sorry_theorems})", + ) for repo in metadata["repositories"]: self.assertIn("url", repo) @@ -1870,25 +2177,53 @@ def test_dataset_splitting(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) - for strategy in ['random', 'novel_premises']: + for strategy in ["random", "novel_premises"]: train_set = set() val_set = set() test_set = set() - with open(dst_dir / strategy / "train.json", 'r') as f: + with open(dst_dir / strategy / "train.json", "r") as f: train_data = json.load(f) - train_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in train_data) + train_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in train_data + ) - with open(dst_dir / strategy / "val.json", 'r') as f: + with open(dst_dir / strategy / "val.json", "r") as f: val_data = json.load(f) - val_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in val_data) + val_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in val_data + ) - with open(dst_dir / strategy / "test.json", 'r') as f: + with open(dst_dir / strategy / "test.json", "r") as f: test_data = json.load(f) - test_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in test_data) + test_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in test_data + ) self.assertGreater(len(train_set), 0) self.assertGreater(len(val_set), 0) @@ -1905,73 +2240,105 @@ def test_dataset_consistency(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) # Check that all theorems in the dataset are from the original repositories - all_theorems_PFR = set(thm.full_name for thm in self.sample_repo_PFR.get_all_theorems) - all_theorems_new_version = set(thm.full_name for thm in self.sample_repo_new_version.get_all_theorems) + all_theorems_PFR = set( + thm.full_name for thm in self.sample_repo_PFR.get_all_theorems + ) + all_theorems_new_version = set( + thm.full_name for thm in self.sample_repo_new_version.get_all_theorems + ) - for strategy in ['random', 'novel_premises']: - for split in ['train', 'val', 'test']: - with open(dst_dir / strategy / f"{split}.json", 'r') as f: + for strategy in ["random", "novel_premises"]: + for 
split in ["train", "val", "test"]: + with open(dst_dir / strategy / f"{split}.json", "r") as f: data = json.load(f) for item in data: - self.assertIn(item['full_name'], all_theorems_PFR | all_theorems_new_version) - + self.assertIn( + item["full_name"], + all_theorems_PFR | all_theorems_new_version, + ) + def test_generate_dataset_with_specific_repo(self): dynamic_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_only_generated" - self.db.generate_merged_dataset(dynamic_dataset_path, repos_to_include=[(self.sample_repo_PFR.url, self.sample_repo_PFR.commit)]) + self.db.generate_merged_dataset( + dynamic_dataset_path, + repos_to_include=[(self.sample_repo_PFR.url, self.sample_repo_PFR.commit)], + ) self.assertTrue(dynamic_dataset_path.exists()) self.assertTrue((dynamic_dataset_path / "random").exists()) self.assertTrue((dynamic_dataset_path / "novel_premises").exists()) - with open(dynamic_dataset_path / "metadata.json", 'r') as f: + with open(dynamic_dataset_path / "metadata.json", "r") as f: metadata = json.load(f) self.assertEqual(len(metadata["repositories"]), 1) - self.assertEqual(metadata["repositories"][0]["url"], self.sample_repo_PFR.url) + self.assertEqual( + metadata["repositories"][0]["url"], self.sample_repo_PFR.url + ) # Compare with the original PFR dataset - manual_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" - - for strategy in ['random', 'novel_premises']: + manual_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" + ) + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual {split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - assert len(manual_theorems) == len(dynamic_theorems), "Manual and dynamic datasets have different number of theorems" - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") - self.assertTrue(self._fast_compare_theorems(manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + assert len(manual_theorems) == len( + dynamic_theorems + ), "Manual and dynamic datasets have different number of theorems" + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) + self.assertTrue( + self._fast_compare_theorems(manual_theorems, dynamic_theorems), + f"Theorem content for {strategy} strategy does not match", + ) logger.info(f"Theorem content for {strategy} strategy matches") self.maxDiff = None logger.info("Comparing corpus and traced 
files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - assert len(manual_corpus) == len(dynamic_corpus), "Manual and dynamic datasets have different number of premise files" + assert len(manual_corpus) == len( + dynamic_corpus + ), "Manual and dynamic datasets have different number of premise files" logger.info("Comparing corpus content") try: self.assertCountEqual(manual_corpus, dynamic_corpus) @@ -1981,15 +2348,17 @@ def test_generate_dataset_with_specific_repo(self): logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - assert len(manual_traced) == len(dynamic_traced), "Manual and dynamic datasets have different number of traced files" + assert len(manual_traced) == len( + dynamic_traced + ), "Manual and dynamic datasets have different number of traced files" logger.info("Comparing traced files content") try: self.assertCountEqual(manual_traced, dynamic_traced) @@ -2002,32 +2371,46 @@ def test_generate_dataset_with_specific_repo(self): def test_compare_manual_and_dynamic_datasets(self): random.seed(3407) - manual_dataset_path = Path(RAID_DIR) / MERGED_DATA_DIR / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_updated" - dynamic_dataset_path = Path(RAID_DIR) / MERGED_DATA_DIR / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_generated" + manual_dataset_path = ( + Path(RAID_DIR) + / MERGED_DATA_DIR + / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_updated" + ) + dynamic_dataset_path = ( + Path(RAID_DIR) + / MERGED_DATA_DIR + / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_generated" + ) self.db.generate_merged_dataset(dynamic_dataset_path) - - for strategy in ['random', 'novel_premises']: + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual 
{split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) # The manual code has a bug where it allows duplicate theorems as long as they exist in different repositories. # As such, we need to remove these duplicates. @@ -2040,89 +2423,145 @@ def test_compare_manual_and_dynamic_datasets(self): else: manual_dict[key] = thm deduplicated_manual_theorems = list(manual_dict.values()) - + dynamic_dict = {self._theorem_to_key(t): t for t in dynamic_theorems} - logger.info(f"After deduplication - Manual theorems: {len(deduplicated_manual_theorems)}, Dynamic theorems: {len(dynamic_theorems)}") - + logger.info( + f"After deduplication - Manual theorems: {len(deduplicated_manual_theorems)}, Dynamic theorems: {len(dynamic_theorems)}" + ) + only_in_manual = set(manual_dict.keys()) - set(dynamic_dict.keys()) only_in_dynamic = set(dynamic_dict.keys()) - set(manual_dict.keys()) - + if only_in_manual: - logger.error(f"{len(only_in_manual)} theorems only in manual dataset for {strategy}") + logger.error( + f"{len(only_in_manual)} theorems only in manual dataset for {strategy}" + ) for key in list(only_in_manual)[:1]: manual_thm = manual_dict[key] - logger.error(f"Manual only: {manual_thm['full_name']} in {manual_thm['file_path']}") - logger.error(f" URL: {manual_thm['url']}, Commit: {manual_thm['commit']}") - logger.error(f" Start: {manual_thm['start']}, End: {manual_thm['end']}") - logger.error(f" Theorem statement: {manual_thm['theorem_statement'][:100]}...") # First 100 chars - + logger.error( + f"Manual only: {manual_thm['full_name']} in {manual_thm['file_path']}" + ) + logger.error( + f" URL: {manual_thm['url']}, Commit: {manual_thm['commit']}" + ) + logger.error( + f" Start: {manual_thm['start']}, End: {manual_thm['end']}" + ) + logger.error( + f" Theorem statement: {manual_thm['theorem_statement'][:100]}..." 
+ ) # First 100 chars + if only_in_dynamic: - logger.error(f"{len(only_in_dynamic)} theorems only in dynamic dataset for {strategy}") + logger.error( + f"{len(only_in_dynamic)} theorems only in dynamic dataset for {strategy}" + ) for key in list(only_in_dynamic)[:1]: dynamic_thm = dynamic_dict[key] - logger.error(f"Dynamic only: {dynamic_thm['full_name']} in {dynamic_thm['file_path']}") - logger.error(f" URL: {dynamic_thm['url']}, Commit: {dynamic_thm['commit']}") - logger.error(f" Start: {dynamic_thm['start']}, End: {dynamic_thm['end']}") - logger.error(f" Theorem statement: {dynamic_thm['theorem_statement'][:100]}...") # First 100 chars - - self.assertEqual(len(only_in_manual), 0, f"Theorems found only in manual dataset for {strategy}") - self.assertEqual(len(only_in_dynamic), 0, f"Theorems found only in dynamic dataset for {strategy}") - - assert len(set(manual_dict.keys())) == len(set(dynamic_dict.keys())), "Manual and dynamic datasets have different number of theorems" - self.assertTrue(self._fast_compare_theorems(deduplicated_manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") - logger.info(f"Theorem content for {strategy} strategy matches after deduplication") + logger.error( + f"Dynamic only: {dynamic_thm['full_name']} in {dynamic_thm['file_path']}" + ) + logger.error( + f" URL: {dynamic_thm['url']}, Commit: {dynamic_thm['commit']}" + ) + logger.error( + f" Start: {dynamic_thm['start']}, End: {dynamic_thm['end']}" + ) + logger.error( + f" Theorem statement: {dynamic_thm['theorem_statement'][:100]}..." + ) # First 100 chars + + self.assertEqual( + len(only_in_manual), + 0, + f"Theorems found only in manual dataset for {strategy}", + ) + self.assertEqual( + len(only_in_dynamic), + 0, + f"Theorems found only in dynamic dataset for {strategy}", + ) + + assert len(set(manual_dict.keys())) == len( + set(dynamic_dict.keys()) + ), "Manual and dynamic datasets have different number of theorems" + self.assertTrue( + self._fast_compare_theorems( + deduplicated_manual_theorems, dynamic_theorems + ), + f"Theorem content for {strategy} strategy does not match", + ) + logger.info( + f"Theorem content for {strategy} strategy matches after deduplication" + ) self.maxDiff = None logger.info("Comparing corpus and traced files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - manual_corpus_dict = {item['path']: item for item in manual_corpus} + manual_corpus_dict = {item["path"]: item for item in manual_corpus} deduplicated_manual_corpus = list(manual_corpus_dict.values()) - dynamic_corpus_dict = {item['path']: item for item in dynamic_corpus} + dynamic_corpus_dict = {item["path"]: item for item in dynamic_corpus} - logger.info(f"Manual corpus: {len(manual_corpus)} items, {len(deduplicated_manual_corpus)} unique") + logger.info( + f"Manual corpus: {len(manual_corpus)} items, {len(deduplicated_manual_corpus)} unique" + ) logger.info(f"Dynamic corpus: {len(dynamic_corpus)} items") - only_in_manual_corpus = set(manual_corpus_dict.keys()) - set(dynamic_corpus_dict.keys()) - only_in_dynamic_corpus = set(dynamic_corpus_dict.keys()) 
- set(manual_corpus_dict.keys()) + only_in_manual_corpus = set(manual_corpus_dict.keys()) - set( + dynamic_corpus_dict.keys() + ) + only_in_dynamic_corpus = set(dynamic_corpus_dict.keys()) - set( + manual_corpus_dict.keys() + ) - self.assertEqual(len(only_in_manual_corpus), 0, "Corpus items found only in manual dataset") - self.assertEqual(len(only_in_dynamic_corpus), 0, "Corpus items found only in dynamic dataset") + self.assertEqual( + len(only_in_manual_corpus), 0, "Corpus items found only in manual dataset" + ) + self.assertEqual( + len(only_in_dynamic_corpus), 0, "Corpus items found only in dynamic dataset" + ) - assert len(set(dynamic_corpus_dict.keys())) == len(set(dynamic_corpus_dict.keys())), "Manual and dynamic datasets have different number of premise files" + assert len(set(dynamic_corpus_dict.keys())) == len( + set(dynamic_corpus_dict.keys()) + ), "Manual and dynamic datasets have different number of premise files" # Since we choose the first processed premise file in the case of duplicates, # we can't compare the lines directly logger.info("Comparing corpus content") try: # Check that the paths are the same in both datasets manual_paths = set(manual_corpus_dict.keys()) - dynamic_paths = set(item['path'] for item in dynamic_corpus) - self.assertEqual(manual_paths, dynamic_paths, "Paths in manual and dynamic corpus do not match") + dynamic_paths = set(item["path"] for item in dynamic_corpus) + self.assertEqual( + manual_paths, + dynamic_paths, + "Paths in manual and dynamic corpus do not match", + ) logger.info("Corpus content matches after deduplication") except AssertionError as e: logger.info("Corpus content mismatch:") logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - manual_traced_dict = {item['traced_file_path']: item for item in manual_traced} + manual_traced_dict = {item["traced_file_path"]: item for item in manual_traced} deduplicated_manual_traced = list(manual_traced_dict.values()) - logger.info(f"Manual traced files: {len(manual_traced)} items, {len(deduplicated_manual_traced)} unique") + logger.info( + f"Manual traced files: {len(manual_traced)} items, {len(deduplicated_manual_traced)} unique" + ) logger.info(f"Dynamic traced files: {len(dynamic_traced)} items") logger.info("Comparing traced files content") @@ -2136,19 +2575,27 @@ def test_compare_manual_and_dynamic_datasets(self): def _theorem_to_key(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']) + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): - logger.info(f"Converting {len(manual_theorems)} manual theorems to hashable format") + logger.info( + f"Converting {len(manual_theorems)} manual theorems to hashable format" + ) manual_set = set(map(self._theorem_to_hashable, manual_theorems)) - assert len(manual_set) == len(manual_theorems), "Manual theorems contain duplicates" - logger.info(f"Converting {len(dynamic_theorems)} dynamic 
theorems to hashable format") + assert len(manual_set) == len( + manual_theorems + ), "Manual theorems contain duplicates" + logger.info( + f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format" + ) dynamic_set = set(map(self._theorem_to_hashable, dynamic_theorems)) - assert len(dynamic_set) == len(dynamic_theorems), "Dynamic theorems contain duplicates" + assert len(dynamic_set) == len( + dynamic_theorems + ), "Dynamic theorems contain duplicates" logger.info("Comparing theorem sets") only_in_manual = manual_set - dynamic_set @@ -2177,22 +2624,29 @@ def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): def _theorem_to_hashable(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']), + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _tactic_to_hashable(self, tactic): return ( - tactic['tactic'], - tactic['annotated_tactic'][0], - tuple((a['full_name'], a['def_path'], tuple(a['def_pos']), tuple(a['def_end_pos'])) - for a in tactic['annotated_tactic'][1]), - tactic['state_before'], - tactic['state_after'] - ) - + tactic["tactic"], + tactic["annotated_tactic"][0], + tuple( + ( + a["full_name"], + a["def_path"], + tuple(a["def_pos"]), + tuple(a["def_end_pos"]), + ) + for a in tactic["annotated_tactic"][1] + ), + tactic["state_before"], + tactic["state_after"], + ) + def test_unicode_handling_in_dataset(self): url_PFR = "https://github.com/teorth/pfr" commit_PFR = "6a5082ee465f9e44cea479c7b741b3163162bb7e" @@ -2200,27 +2654,55 @@ def test_unicode_handling_in_dataset(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "metadata.json", 'r', encoding='utf-8') as f: + with open(dst_dir / "metadata.json", "r", encoding="utf-8") as f: metadata = json.load(f) - self.assertIn('repositories', metadata, "No 'repositories' key in metadata") - self.assertGreater(len(metadata['repositories']), 0, "No repositories in metadata") - repo_PFR = metadata['repositories'][0] - self.assertIn('metadata', repo_PFR, "No 'metadata' key in repository") - repo_PFR_metadata = repo_PFR['metadata'] - self.assertIn('unicode', repo_PFR_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_PFR_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][0]['metadata']['unicode']) - - self.assertGreater(len(metadata['repositories']), 1, "Only one repository in metadata") - repo_new_version = metadata['repositories'][1] - self.assertIn('metadata', repo_new_version, "No 'metadata' key in repository") - repo_new_version_metadata = repo_new_version['metadata'] - self.assertIn('unicode', repo_new_version_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_new_version_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][1]['metadata']['unicode']) + self.assertIn("repositories", metadata, "No 'repositories' key in metadata") + self.assertGreater( + len(metadata["repositories"]), 0, "No repositories 
in metadata" + ) + repo_PFR = metadata["repositories"][0] + self.assertIn("metadata", repo_PFR, "No 'metadata' key in repository") + repo_PFR_metadata = repo_PFR["metadata"] + self.assertIn( + "unicode", repo_PFR_metadata, "No 'unicode' key in repository metadata" + ) + self.assertIn( + "ユニコード", + repo_PFR_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][0]["metadata"]["unicode"] + ) + + self.assertGreater( + len(metadata["repositories"]), 1, "Only one repository in metadata" + ) + repo_new_version = metadata["repositories"][1] + self.assertIn( + "metadata", repo_new_version, "No 'metadata' key in repository" + ) + repo_new_version_metadata = repo_new_version["metadata"] + self.assertIn( + "unicode", + repo_new_version_metadata, + "No 'unicode' key in repository metadata", + ) + self.assertIn( + "ユニコード", + repo_new_version_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][1]["metadata"]["unicode"] + ) def tearDown(self): # Clean up generated files after tests @@ -2230,16 +2712,28 @@ def tearDown(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) if dst_dir.exists(): shutil.rmtree(dst_dir) def test_theorem_statement(self): - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("opNorm_lsmul", theorem.theorem_statement) - theorem = next(t for t in self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("le_mul_right", theorem.theorem_statement) @@ -2254,12 +2748,25 @@ def test_unicode_handling(self): def test_file_tracing(self): self.assertGreater(len(self.sample_repo_PFR.files_traced), 0) - self.assertIn(Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo_PFR.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo_PFR.files_traced) + self.assertIn( + Path("PFR/Mathlib/GroupTheory/Torsion.lean"), + self.sample_repo_PFR.files_traced, + ) + self.assertIn( + Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo_PFR.files_traced, + ) self.assertGreater(len(self.sample_repo_new_version.files_traced), 0) - self.assertIn(Path("NewVersionTest/ExercisesOne.lean"), self.sample_repo_new_version.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo_new_version.files_traced) + self.assertIn( + Path("NewVersionTest/ExercisesOne.lean"), + self.sample_repo_new_version.files_traced, + ) + self.assertIn( + Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo_new_version.files_traced, + ) + class TestDynamicDatabaseProver(unittest.TestCase): """ @@ -2281,6 +2788,7 
@@ class TestDynamicDatabaseProver(unittest.TestCase): a single unproved theorem, which can then be manipulated to test different aspects of the system's functionality. """ + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -2298,9 +2806,9 @@ def setUp(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="abcdef1234567890", - theorem_statement="theorem test_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem test_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) self.db.add_repository(self.repo) @@ -2310,7 +2818,7 @@ def test_create_annotated_tactic(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) self.assertEqual(annotated_tactic.tactic, tactic) self.assertEqual(annotated_tactic.annotated_tactic, (tactic, [])) @@ -2324,14 +2832,14 @@ def test_update_theorem_with_proof(self): tactic="rw [add_comm]", annotated_tactic=("rw [add_comm]", []), state_before="⊢ 2 + 2 = 4", - state_after="⊢ 2 + 2 = 4" + state_after="⊢ 2 + 2 = 4", ), AnnotatedTactic( tactic="refl", annotated_tactic=("refl", []), state_before="⊢ 2 + 2 = 4", - state_after="no goals" - ) + state_after="no goals", + ), ] theorem.traced_tactics = traced_tactics self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -2340,7 +2848,9 @@ def test_update_theorem_with_proof(self): updated_repo = self.db.get_repository(self.repo.url, self.repo.commit) self.assertEqual(len(updated_repo.sorry_theorems_proved), 1) self.assertEqual(len(updated_repo.sorry_theorems_unproved), 0) - self.assertEqual(updated_repo.sorry_theorems_proved[0].traced_tactics, traced_tactics) + self.assertEqual( + updated_repo.sorry_theorems_proved[0].traced_tactics, traced_tactics + ) def test_json_serialization_with_proved_theorems(self): results = [ @@ -2352,42 +2862,45 @@ def test_json_serialization_with_proved_theorems(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] - + traced_tactics = [ AnnotatedTactic( tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) - + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) + # Serialize to JSON json_file = "proved_theorems_test.json" self.db.to_json(json_file) - + # Deserialize from JSON loaded_db = DynamicDatabase.from_json(json_file) - + # Verify the loaded data loaded_repo = loaded_db.get_repository(self.repo.url, self.repo.commit) self.assertIsNotNone(loaded_repo) - + self.assertEqual(len(loaded_repo.sorry_theorems_unproved), 0) self.assertEqual(len(loaded_repo.sorry_theorems_proved), 1) - + proved_theorem = loaded_repo.sorry_theorems_proved[0] self.assertEqual(proved_theorem.full_name, "test_theorem") self.assertEqual(len(proved_theorem.traced_tactics), 2) self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - + # Test updating the loaded database new_theorem = Theorem( full_name="new_theorem", @@ -2396,18 +2909,18 @@ def test_json_serialization_with_proved_theorems(self): end=Pos(5, 1), url="https://github.com/test/repo", commit="abcdef1234567890", - theorem_statement="theorem new_theorem : 3 + 3 = 6 := sorry" + 
theorem_statement="theorem new_theorem : 3 + 3 = 6 := sorry", ) loaded_repo.sorry_theorems_unproved.append(new_theorem) - + # Serialize the updated database updated_json_file = "updated_proved_theorems_test.json" loaded_db.to_json(updated_json_file) - + # Deserialize and verify the update final_db = DynamicDatabase.from_json(updated_json_file) final_repo = final_db.get_repository(self.repo.url, self.repo.commit) - + self.assertEqual(len(final_repo.sorry_theorems_proved), 1) self.assertEqual(len(final_repo.sorry_theorems_unproved), 1) self.assertEqual(final_repo.sorry_theorems_unproved[0].full_name, "new_theorem") @@ -2415,16 +2928,16 @@ def test_json_serialization_with_proved_theorems(self): def test_update_theorem_with_proof_and_json(self): json_file = "temp_file.json" theorem = self.repo.sorry_theorems_unproved[0] - + traced_tactics = [ AnnotatedTactic( tactic="rw [add_comm]", annotated_tactic=("rw [add_comm]", []), state_before="", - state_after="" + state_after="", ) ] - + theorem.traced_tactics = traced_tactics self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) self.db.update_repository(self.repo) @@ -2440,7 +2953,7 @@ def test_update_theorem_with_proof_and_json(self): self.assertEqual(proved_theorem.file_path, theorem.file_path) self.assertEqual(proved_theorem.start, theorem.start) self.assertEqual(proved_theorem.end, theorem.end) - + self.assertEqual(len(proved_theorem.traced_tactics), 1) loaded_tactic = proved_theorem.traced_tactics[0] self.assertEqual(loaded_tactic.tactic, "rw [add_comm]") @@ -2458,7 +2971,7 @@ def test_prove_sorry_theorems(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] if results else None @@ -2472,11 +2985,13 @@ def test_prove_sorry_theorems(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) ) self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.assertEqual(len(self.repo.sorry_theorems_unproved), 0) self.assertEqual(len(self.repo.sorry_theorems_proved), 1) @@ -2493,10 +3008,14 @@ def test_save_load_dynamic_database(self): self.assertEqual(len(self.db.repositories), len(loaded_db.repositories)) self.assertEqual(self.db.repositories[0].url, loaded_db.repositories[0].url) - self.assertEqual(self.db.repositories[0].commit, loaded_db.repositories[0].commit) - self.assertEqual(len(self.db.repositories[0].sorry_theorems_unproved), - len(loaded_db.repositories[0].sorry_theorems_unproved)) - + self.assertEqual( + self.db.repositories[0].commit, loaded_db.repositories[0].commit + ) + self.assertEqual( + len(self.db.repositories[0].sorry_theorems_unproved), + len(loaded_db.repositories[0].sorry_theorems_unproved), + ) + def test_add_repository_and_save(self): json_file = "temp_file.json" @@ -2517,9 +3036,9 @@ def test_add_repository_and_save(self): "end": [10, 1], "url": "https://github.com/test/new-repo", "commit": "1234567890abcdef", - "theorem_statement": "theorem new_test_theorem : 3 + 3 = 6 := sorry" + "theorem_statement": "theorem new_test_theorem : 3 + 3 = 6 := sorry", } - ] + ], } new_repo = Repository.from_dict(new_repo_data) @@ -2528,13 +3047,18 @@ def test_add_repository_and_save(self): loaded_db = DynamicDatabase.from_json(json_file) self.assertEqual(len(loaded_db.repositories), 2) - 
self.assertEqual(loaded_db.repositories[1].url, "https://github.com/test/new-repo") + self.assertEqual( + loaded_db.repositories[1].url, "https://github.com/test/new-repo" + ) self.assertEqual(len(loaded_db.repositories[1].sorry_theorems_unproved), 1) - self.assertEqual(loaded_db.repositories[1].sorry_theorems_unproved[0].full_name, "new_test_theorem") + self.assertEqual( + loaded_db.repositories[1].sorry_theorems_unproved[0].full_name, + "new_test_theorem", + ) def test_prove_sorry_theorems_and_save(self): json_file = "temp_file.json" - + self.db.to_json(json_file) results = [ @@ -2546,7 +3070,7 @@ def test_prove_sorry_theorems_and_save(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] if results else None @@ -2560,11 +3084,13 @@ def test_prove_sorry_theorems_and_save(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) ) self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.db.to_json(json_file) loaded_db = DynamicDatabase.from_json(json_file) @@ -2577,8 +3103,15 @@ def test_prove_sorry_theorems_and_save(self): self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - def _theorem_identifier(self, theorem: Theorem) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: - return (theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end)) + def _theorem_identifier( + self, theorem: Theorem + ) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: + return ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + ) def test_prove_sorry_theorems_with_duplicates(self): # Create two repositories with the same theorem but different commits @@ -2597,9 +3130,9 @@ def test_prove_sorry_theorems_with_duplicates(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="commit1", - theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) repo2 = Repository( @@ -2617,9 +3150,9 @@ def test_prove_sorry_theorems_with_duplicates(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="commit2", - theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) # Create a test database with both repositories @@ -2647,29 +3180,36 @@ def test_prove_sorry_theorems_with_duplicates(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] # Simulate the prove_sorry_theorems function processed_theorems = set() - for repo in sorted(loaded_db.repositories, key=lambda r: r.metadata['date_processed'], reverse=True): + for repo in sorted( + loaded_db.repositories, + key=lambda r: r.metadata["date_processed"], + reverse=True, + ): for theorem in repo.sorry_theorems_unproved: theorem_id = self._theorem_identifier(theorem) if theorem_id in processed_theorems: continue - + processed_theorems.add(theorem_id) - + # Apply the proof to the theorem - result = results[0] # In a real scenario, this would be the result of calling the prover + result = results[ + 0 + ] # In a real 
scenario, this would be the result of calling the prover traced_tactics = [ AnnotatedTactic( tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] theorem.traced_tactics = traced_tactics repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -2695,13 +3235,13 @@ def test_prove_sorry_theorems_with_duplicates(self): self.assertEqual(len(proved_theorem.traced_tactics), 2) self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - + def test_repeated_to_json_during_proving(self): json_file = "repeated_save_test.json" - + # Initial save self.db.to_json(json_file) - + results = [ SearchResult( theorem=self.repo.sorry_theorems_unproved[0], @@ -2711,10 +3251,10 @@ def test_repeated_to_json_during_proving(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] - + # Simulate proving and saving after each theorem for result in results: if isinstance(result, SearchResult) and result.status == Status.PROVED: @@ -2723,22 +3263,26 @@ def test_repeated_to_json_during_proving(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.db.update_repository(self.repo) self.db.to_json(json_file) # Save after each theorem is proved - + # Final save self.db.to_json(json_file) - + # Load and verify loaded_db = DynamicDatabase.from_json(json_file) self.assertEqual(len(loaded_db.repositories[0].sorry_theorems_proved), 1) self.assertEqual(len(loaded_db.repositories[0].sorry_theorems_unproved), 0) + class TestDynamicDatabaseEmpty(unittest.TestCase): def setUp(self): self.empty_json_path = "empty_database.json" @@ -2749,38 +3293,38 @@ def tearDown(self): os.remove(self.empty_json_path) def test_from_json_empty_file(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({"repositories": []}, f) - + db = DynamicDatabase.from_json(self.empty_json_path) self.assertEqual(len(db.repositories), 0) def test_to_json_empty_database(self): self.db.to_json(self.empty_json_path) - + self.assertTrue(os.path.exists(self.empty_json_path)) - with open(self.empty_json_path, 'r') as f: + with open(self.empty_json_path, "r") as f: content = json.load(f) self.assertEqual(content, {"repositories": []}) def test_from_json_invalid_empty_object(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({}, f) - + with self.assertRaises(ValueError): DynamicDatabase.from_json(self.empty_json_path) def test_from_json_nonexistent_file(self): if os.path.exists(self.empty_json_path): os.remove(self.empty_json_path) - + with self.assertRaises(FileNotFoundError): DynamicDatabase.from_json(self.empty_json_path) def test_add_repository_to_empty_database(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({"repositories": []}, f) - + db = DynamicDatabase.from_json(self.empty_json_path) repo_data = { @@ -2789,19 +3333,21 @@ def test_add_repository_to_empty_database(self): 
"commit": "abc123", "lean_version": "3.50.3", "lean_dojo_version": "1.8.4", - "metadata": {"date_processed": datetime.datetime.now().isoformat()} + "metadata": {"date_processed": datetime.datetime.now().isoformat()}, } repo = Repository.from_dict(repo_data) db.add_repository(repo) - + db.to_json(self.empty_json_path) - + loaded_db = DynamicDatabase.from_json(self.empty_json_path) self.assertEqual(len(loaded_db.repositories), 1) self.assertEqual(loaded_db.repositories[0].url, "https://github.com/test/repo") + def main(): unittest.main() + if __name__ == "__main__": main() From a455001658465226bc4168b40da67e5a82fbf498 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 09:26:21 -0700 Subject: [PATCH 03/29] Fixing imports --- CLAUDE.md | 91 ++++++++++++++++++++++++++ common.py | 26 ++++---- custom_traced_data.py | 34 +++++----- custom_utils.py | 20 +++--- dynamic_database.py | 14 ++-- generate_benchmark_lean4.py | 19 +++--- generator/datamodule.py | 20 ++---- generator/main.py | 1 + generator/model.py | 26 +++----- ld_path.txt | 1 + leanagent.py | 67 ++++++++----------- pl_path.txt | 1 + prover/evaluate.py | 16 ++--- prover/proof_search.py | 33 ++++------ prover/search_tree.py | 16 ++--- retrieval/bm25/main.py | 23 ++++--- retrieval/bm25/train_tokenizer.py | 5 +- retrieval/datamodule.py | 24 +++---- retrieval/evaluate.py | 9 +-- retrieval/evaluate_multiple.py | 9 +-- retrieval/fisher_computation_module.py | 5 +- retrieval/index.py | 5 +- retrieval/main.py | 11 ++-- retrieval/model.py | 29 ++++---- tests/test_common.py | 1 + unittest_dynamic_database.py | 38 +++++------ 26 files changed, 298 insertions(+), 246 deletions(-) create mode 100644 CLAUDE.md create mode 100644 ld_path.txt create mode 100644 pl_path.txt diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..5ef3e45 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,91 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Common Development Commands + +### Environment Setup +```bash +# Create and activate conda environment +conda create -n "LeanAgent" python=3.10 +conda activate LeanAgent +pip install -r requirements.txt +``` + +### Running Tests +```bash +# Run the test suite +conda activate LeanAgent +python -m pytest tests/ +``` + +### Running LeanAgent +```bash +# Main training/proving pipeline +bash run_leanagent.sh +``` + +### Fisher Information Matrix Computation (for EWC ablations) +```bash +# Compute Fisher Information Matrix +bash run_compute_fisher.sh +``` + +### Environment Configuration Required +Before running, update the following in shell scripts: +- `RAID_DIR`: Path to storage directory +- `PATH_TO_CONDA_ENV`: Path to conda installation +- `GITHUB_ACCESS_TOKEN`: GitHub personal access token + +## Architecture Overview + +LeanAgent is a lifelong learning framework for formal theorem proving that continuously learns from expanding mathematical repositories without forgetting previous knowledge. + +### Core Components + +1. **Dynamic Database (`dynamic_database.py`)**: Central JSON-based storage system that manages mathematical knowledge across repositories. Tracks theorems (proven, sorry-but-now-proven, unproven), premise files, and repository metadata with deduplication capabilities. + +2. 
**Repository Processing Pipeline**: + - Discovers and clones Lean repositories from GitHub + - Uses LeanDojo to trace/extract theorems, proofs, and premises + - Checks Lean version compatibility (4.3.0-rc2 to 4.8.0-rc1) + - Builds dependency graphs and exports structured datasets + +3. **Progressive Retriever Training (`retrieval/`)**: + - Trains ByT5-based retriever incrementally on new repositories + - Uses PyTorch Lightning with DDP for distributed training + - Saves checkpoints based on R@10 validation performance + - Measures both plasticity (new learning) and stability (retention) + +4. **Theorem Proving (`prover/`)**: + - Best-first tree search for sorry theorem proving + - Uses trained retriever to find relevant premises + - Generates tactic candidates with beam search + - 10-minute timeout per theorem, processes in batches of 12 + +5. **Curriculum Learning**: Exponential complexity scoring (e^S where S = proof steps) with Easy/Medium/Hard categorization based on 33rd/67th percentiles. + +### Key Configuration Points + +The main configuration happens in `leanagent.py` where you must set: +- `repo_dir`: Repository path +- `DATA_DIR`: Data storage directory +- `CHECKPOINT_DIR`: Model checkpoint directory +- `EVAL_RESULTS_FILE_PATH`: Evaluation results path +- `DB_FILE_NAME`: Database filename +- `PROOF_LOG_FILE_NAME`: Proof logging filename +- `ENCOUNTERED_THEOREMS_FILE`: Theorem tracking file +- `FISHER_DIR`: Fisher Information Matrix directory (optional) + +### Distributed Computing +- Uses Ray for distributed repository processing +- PyTorch Lightning DDP for multi-GPU training (typically 4 A100s) +- Custom timeout settings for lengthy operations +- Resource cleanup between phases to prevent memory leaks + +### Repository Integration +When theorems are successfully proven, LeanAgent can: +- Create temporary branches +- Replace `sorry` with generated proofs +- Submit pull requests to original repositories +- Use standardized commit messages and PR templates \ No newline at end of file diff --git a/common.py b/common.py index ac50437..b9eafef 100644 --- a/common.py +++ b/common.py @@ -1,24 +1,24 @@ +import json import os +import random import re import sys -import json -import random -import torch import tempfile +from dataclasses import dataclass, field +from typing import Any, Dict, Generator, List, Optional, Tuple + import networkx as nx -from loguru import logger -from lean_dojo import Pos import pytorch_lightning as pl -from dataclasses import dataclass, field -from leanagent_utils import remove_marks, MARK_START_SYMBOL, MARK_END_SYMBOL -from pytorch_lightning.utilities.deepspeed import ( - convert_zero_checkpoint_to_fp32_state_dict, -) -from transformers import get_cosine_schedule_with_warmup -from deepspeed.ops.adam import FusedAdam, DeepSpeedCPUAdam -from typing import Optional, List, Dict, Any, Tuple, Generator +import torch +from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from lean_dojo import Pos +from loguru import logger from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +from pytorch_lightning.utilities.deepspeed import \ + convert_zero_checkpoint_to_fp32_state_dict +from transformers import get_cosine_schedule_with_warmup +from leanagent_utils import MARK_END_SYMBOL, MARK_START_SYMBOL, remove_marks Example = Dict[str, Any] Batch = Dict[str, Any] diff --git a/custom_traced_data.py b/custom_traced_data.py index ae569e4..ee61c83 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -1,30 +1,26 @@ """This module defines traced 
repos/files/theorems.""" -import re -import os +import itertools import json +import os import random -import itertools +import re import webbrowser -import networkx as nx -from tqdm import tqdm -from lxml import etree +from dataclasses import dataclass, field from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import networkx as nx from loguru import logger -from dataclasses import dataclass, field -from typing import List, Optional, Dict, Any, Tuple, Union - -from ..utils import ( - is_git_repo, - compute_md5, - to_lean_path, - to_dep_path, - to_json_path, - to_xml_path, -) +from lxml import etree +from tqdm import tqdm + +from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, + NUM_WORKERS) +from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, + to_lean_path, to_xml_path) from .ast import * -from .lean import LeanFile, LeanGitRepo, Theorem, Pos -from ..constants import NUM_WORKERS, LOAD_USED_PACKAGES_ONLY, LEAN4_PACKAGES_DIR +from .lean import LeanFile, LeanGitRepo, Pos, Theorem @dataclass(frozen=True) diff --git a/custom_utils.py b/custom_utils.py index dd587a4..181e9ac 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -1,20 +1,22 @@ """Utility functions used internally by LeanDojo.""" -import re +import hashlib import os +import re +import subprocess +import tempfile import time -import urllib import typing -import hashlib -import tempfile -import subprocess +import urllib +from contextlib import contextmanager +from functools import cache from pathlib import Path +from typing import Generator, List, Optional, Tuple, Union + from loguru import logger -from functools import cache -from contextlib import contextmanager -from typing import Tuple, Union, List, Generator, Optional -from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR +from .constants import (LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, NUM_WORKERS, + TMP_DIR) @contextmanager diff --git a/dynamic_database.py b/dynamic_database.py index d819000..d3fccc5 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,16 +1,18 @@ from __future__ import annotations + import datetime import json import os -from dataclasses import dataclass, field, asdict -from typing import List, Dict, Optional, Union, Tuple, Set -from pathlib import Path -from lean_dojo.data_extraction.lean import Pos -from tqdm import tqdm import random +import shutil from collections import defaultdict +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple, Union + +from lean_dojo.data_extraction.lean import Pos from loguru import logger -import shutil +from tqdm import tqdm def parse_pos(pos_str): diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 0c72377..d343931 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -1,20 +1,21 @@ import json -import shutil import random -import networkx as nx +import re +import shutil +import subprocess +import sys +import time +from collections import defaultdict from copy import copy -from pathlib import Path -from loguru import logger from datetime import datetime -from collections import defaultdict +from pathlib import Path from typing import Dict, List, Union -import time + import lean_dojo +import networkx as nx from lean_dojo import * from lean_dojo.constants import LEAN4_PACKAGES_DIR -import re -import subprocess -import sys +from loguru import logger random.seed(3407) # 
https://arxiv.org/abs/2109.08203 diff --git a/generator/datamodule.py b/generator/datamodule.py index 882e64a..0dde8da 100644 --- a/generator/datamodule.py +++ b/generator/datamodule.py @@ -1,24 +1,18 @@ """Data module for the tactic generator.""" -import os import json +import os import pickle -from tqdm import tqdm -from loguru import logger +from typing import Any, Dict, List, Optional + import pytorch_lightning as pl -from typing import Optional, List, Dict, Any +from loguru import logger from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm from transformers import AutoTokenizer, ByT5Tokenizer -from common import ( - Batch, - Corpus, - Example, - format_state, - remove_marks, - format_tactic, - format_augmented_state, -) +from common import (Batch, Corpus, Example, format_augmented_state, + format_state, format_tactic, remove_marks) class GeneratorDataset(Dataset): diff --git a/generator/main.py b/generator/main.py index 9ae06c8..3d42239 100644 --- a/generator/main.py +++ b/generator/main.py @@ -1,6 +1,7 @@ """Script for training the tactic generator.""" import os + from loguru import logger from pytorch_lightning.cli import LightningCLI diff --git a/generator/model.py b/generator/model.py index f07e95c..66d3617 100644 --- a/generator/model.py +++ b/generator/model.py @@ -1,28 +1,22 @@ """Lightning module for the tactic generator.""" import os -import torch +import pickle import shutil +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple + import openai -import pickle +import pytorch_lightning as pl +import torch from lean_dojo import Pos from loguru import logger -import pytorch_lightning as pl from torchmetrics import Metric -from abc import ABC, abstractmethod -from typing import List, Dict, Any, Optional, Tuple -from transformers import T5ForConditionalGeneration, AutoTokenizer - -from common import ( - zip_strict, - remove_marks, - IndexedCorpus, - get_optimizers, - load_checkpoint, - format_augmented_state, -) -from retrieval.model import PremiseRetriever +from transformers import AutoTokenizer, T5ForConditionalGeneration +from common import (IndexedCorpus, format_augmented_state, get_optimizers, + load_checkpoint, remove_marks, zip_strict) +from retrieval.model import PremiseRetriever torch.set_float32_matmul_precision("medium") diff --git a/ld_path.txt b/ld_path.txt new file mode 100644 index 0000000..c00d90a --- /dev/null +++ b/ld_path.txt @@ -0,0 +1 @@ +/Users/motiwari/miniforge3/envs/LeanAgent/lib/python3.10/site-packages/lean_dojo/__init__.py diff --git a/leanagent.py b/leanagent.py index ea7d062..59d2724 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,54 +1,45 @@ # import all the necessary libraries +import json import math -import ray -from collections import defaultdict import os -import requests -import subprocess +import pickle +import random import re import shutil -from lean_dojo import * -import os -import json -import pickle +import subprocess +import sys +import time +import traceback +from collections import defaultdict +from copy import copy +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import lean_dojo import numpy as np -from tqdm import tqdm -from loguru import logger +import pytorch_lightning as pl +import ray +import requests +import torch +from lean_dojo import * +from lean_dojo import LeanGitRepo, Pos from lean_dojo import Theorem -from typing import List, Tuple, Optional -from lean_dojo import LeanGitRepo, Pos, 
is_available_in_cache from lean_dojo import Theorem as LeanDojoTheorem -import json -import shutil -import random -from copy import copy -from pathlib import Path +from lean_dojo import is_available_in_cache from loguru import logger -from datetime import datetime, timedelta -from collections import defaultdict -from typing import Dict, List, Union -import generate_benchmark_lean4 -import traceback -import sys +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import (Callback, EarlyStopping, + LearningRateMonitor, ModelCheckpoint) +from pytorch_lightning.strategies import DDPStrategy from tqdm import tqdm + +import generate_benchmark_lean4 from dynamic_database import * -import time -from pytorch_lightning.strategies import DDPStrategy -from prover.proof_search import Status, DistributedProver, SearchResult -import re -import lean_dojo -import pytorch_lightning as pl -from retrieval.model import PremiseRetriever +from prover.proof_search import DistributedProver, SearchResult, Status from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli -import torch -from pytorch_lightning.callbacks import ( - ModelCheckpoint, - EarlyStopping, - LearningRateMonitor, - Callback, -) -from pytorch_lightning import seed_everything +from retrieval.model import PremiseRetriever # Set the seed for reproducibility random.seed(3407) # https://arxiv.org/abs/2109.08203 diff --git a/pl_path.txt b/pl_path.txt new file mode 100644 index 0000000..de1baf0 --- /dev/null +++ b/pl_path.txt @@ -0,0 +1 @@ +/Users/motiwari/miniforge3/envs/LeanAgent/lib/python3.10/site-packages/pytorch_lightning/__init__.py diff --git a/prover/evaluate.py b/prover/evaluate.py index 44eec39..4ad634f 100644 --- a/prover/evaluate.py +++ b/prover/evaluate.py @@ -1,18 +1,18 @@ """Script for evaluating the prover on theorems extracted by LeanDojo.""" -import os -import uuid +import argparse +import hashlib import json +import os import pickle -import hashlib -import argparse +import uuid +from typing import List, Optional, Tuple + +from lean_dojo import LeanGitRepo, Pos, Theorem, is_available_in_cache from loguru import logger -from lean_dojo import Theorem -from typing import List, Tuple, Optional -from lean_dojo import LeanGitRepo, Theorem, Pos, is_available_in_cache from common import set_logger -from prover.proof_search import Status, DistributedProver +from prover.proof_search import DistributedProver, Status def _get_theorems( diff --git a/prover/proof_search.py b/prover/proof_search.py index e927630..dd08fe6 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -1,36 +1,27 @@ """Proof search using best-first search.""" +import asyncio +import heapq import os import sys -import ray import time import uuid -import heapq -import asyncio -import torch -from lean_dojo import ( - Pos, - Dojo, - Theorem, - LeanGitRepo, - TacticState, - LeanError, - TimeoutError, - ProofFinished, - ProofGivenUp, - DojoInitError, - DojoCrashError, - DojoHardTimeoutError, -) -from loguru import logger from dataclasses import dataclass from typing import List, Optional, Tuple + +import ray +import torch +from lean_dojo import (Dojo, DojoCrashError, DojoHardTimeoutError, + DojoInitError, LeanError, LeanGitRepo, Pos, + ProofFinished, ProofGivenUp, TacticState, Theorem, + TimeoutError) +from loguru import logger from ray.util.actor_pool import ActorPool -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from vllm import AsyncEngineArgs, AsyncLLMEngine, 
RequestOutput, SamplingParams from common import zip_strict +from generator.model import FixedTacticGenerator, RetrievalAugmentedGenerator from prover.search_tree import * -from generator.model import RetrievalAugmentedGenerator, FixedTacticGenerator tolerance = 1 # second RAID_DIR = os.environ.get("RAID_DIR") diff --git a/prover/search_tree.py b/prover/search_tree.py index 222d030..b1df0a2 100644 --- a/prover/search_tree.py +++ b/prover/search_tree.py @@ -1,18 +1,14 @@ """Definitions of the search tree used by the prover.""" import math -from enum import Enum -from lean_dojo import ( - TacticState, - LeanError, - TimeoutError, - ProofGivenUp, - ProofFinished, -) from abc import ABC, abstractmethod -from functools import total_ordering from dataclasses import dataclass, field -from typing import Optional, List, Tuple, Iterable, Union +from enum import Enum +from functools import total_ordering +from typing import Iterable, List, Optional, Tuple, Union + +from lean_dojo import (LeanError, ProofFinished, ProofGivenUp, TacticState, + TimeoutError) class Status(Enum): diff --git a/retrieval/bm25/main.py b/retrieval/bm25/main.py index 0db0900..1d41415 100644 --- a/retrieval/bm25/main.py +++ b/retrieval/bm25/main.py @@ -1,24 +1,23 @@ """Script for training the BM25 premise retriever.""" -import os -import ray -import json -import pickle import argparse import itertools -import numpy as np -from tqdm import tqdm +import json import multiprocessing -from loguru import logger -from common import Corpus +import os +import pickle +from typing import Any, Dict, List + +import numpy as np +import ray from lean_dojo import Pos +from loguru import logger from rank_bm25 import BM25Okapi -from tokenizers import Tokenizer -from typing import List, Dict, Any from ray.util.actor_pool import ActorPool +from tokenizers import Tokenizer +from tqdm import tqdm - -from common import Context, format_state, get_all_pos_premises +from common import Context, Corpus, format_state, get_all_pos_premises def _process_theorem( diff --git a/retrieval/bm25/train_tokenizer.py b/retrieval/bm25/train_tokenizer.py index 28f1b3f..fc0514a 100644 --- a/retrieval/bm25/train_tokenizer.py +++ b/retrieval/bm25/train_tokenizer.py @@ -1,10 +1,11 @@ -import os import argparse +import os + from loguru import logger from tokenizers import Tokenizer from tokenizers.models import BPE -from tokenizers.trainers import BpeTrainer from tokenizers.pre_tokenizers import Whitespace +from tokenizers.trainers import BpeTrainer from common import Corpus from retrieval.datamodule import RetrievalDataset diff --git a/retrieval/datamodule.py b/retrieval/datamodule.py index 400ca88..39ee444 100644 --- a/retrieval/datamodule.py +++ b/retrieval/datamodule.py @@ -1,23 +1,23 @@ """Datamodule for the premise retrieval.""" -import os +import itertools import json -import torch +import os +import pickle import random -import itertools -from tqdm import tqdm -from loguru import logger from copy import deepcopy -from lean_dojo import Pos +from typing import List, Optional + import pytorch_lightning as pl -from lean_dojo import LeanGitRepo -from typing import Optional, List +import torch +from lean_dojo import LeanGitRepo, Pos +from loguru import logger +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm from transformers import AutoTokenizer -from torch.utils.data import Dataset, DataLoader -import pickle - -from common import Context, Corpus, Batch, Example, format_state, get_all_pos_premises +from common import (Batch, Context, Corpus, 
Example, format_state, + get_all_pos_premises) class RetrievalDataset(Dataset): diff --git a/retrieval/evaluate.py b/retrieval/evaluate.py index c18ddb6..5f31b95 100644 --- a/retrieval/evaluate.py +++ b/retrieval/evaluate.py @@ -1,13 +1,14 @@ """Script for evaluating the premise retriever.""" -import os +import argparse import json +import os import pickle -import argparse -import numpy as np -from tqdm import tqdm from typing import Tuple + +import numpy as np from loguru import logger +from tqdm import tqdm def _eval(data, preds_map) -> Tuple[float, float, float]: diff --git a/retrieval/evaluate_multiple.py b/retrieval/evaluate_multiple.py index 6c9a4d0..84ede56 100644 --- a/retrieval/evaluate_multiple.py +++ b/retrieval/evaluate_multiple.py @@ -1,11 +1,12 @@ -import os +import argparse import json +import os import pickle -import argparse -import numpy as np -from tqdm import tqdm from typing import List, Tuple + +import numpy as np from loguru import logger +from tqdm import tqdm def _eval(data, preds_map) -> Tuple[float, float, float]: diff --git a/retrieval/fisher_computation_module.py b/retrieval/fisher_computation_module.py index a333575..d257b3c 100644 --- a/retrieval/fisher_computation_module.py +++ b/retrieval/fisher_computation_module.py @@ -1,8 +1,9 @@ +import pickle + import pytorch_lightning as pl -from loguru import logger import torch import torch.distributed as dist -import pickle +from loguru import logger class FisherComputationModule(pl.LightningModule): diff --git a/retrieval/index.py b/retrieval/index.py index 1fb2f05..1a6f89b 100644 --- a/retrieval/index.py +++ b/retrieval/index.py @@ -1,8 +1,9 @@ """Script for indexing the corpus using the retriever.""" -import torch -import pickle import argparse +import pickle + +import torch from loguru import logger from common import IndexedCorpus diff --git a/retrieval/main.py b/retrieval/main.py index 331846d..ed71157 100644 --- a/retrieval/main.py +++ b/retrieval/main.py @@ -1,17 +1,18 @@ """Script for training the premise retriever.""" +import json import os +import pickle +import sys from typing import Tuple + import numpy as np -import pickle -import json -from tqdm import tqdm from loguru import logger from pytorch_lightning.cli import LightningCLI, SaveConfigCallback -import sys +from tqdm import tqdm -from retrieval.model import PremiseRetriever from retrieval.datamodule import RetrievalDataModule +from retrieval.model import PremiseRetriever class CLI(LightningCLI): diff --git a/retrieval/model.py b/retrieval/model.py index 6832674..a03946f 100644 --- a/retrieval/model.py +++ b/retrieval/model.py @@ -1,31 +1,24 @@ """Ligihtning module for the premise retriever.""" -import os import json import math -import torch +import os import pickle +from datetime import datetime, timedelta +from typing import Any, Dict, List, Tuple, Union + import numpy as np -from tqdm import tqdm -from lean_dojo import Pos -from loguru import logger import pytorch_lightning as pl +import torch import torch.nn.functional as F -from typing import List, Dict, Any, Tuple, Union -from transformers import T5EncoderModel, AutoTokenizer +from lean_dojo import Pos +from loguru import logger from torch.distributed import barrier -from datetime import datetime, timedelta - -from common import ( - Premise, - Context, - Corpus, - get_optimizers, - load_checkpoint, - zip_strict, - cpu_checkpointing_enabled, -) +from tqdm import tqdm +from transformers import AutoTokenizer, T5EncoderModel +from common import (Context, Corpus, Premise, 
cpu_checkpointing_enabled, + get_optimizers, load_checkpoint, zip_strict) torch.set_float32_matmul_precision("medium") diff --git a/tests/test_common.py b/tests/test_common.py index 6855286..735274d 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,4 +1,5 @@ import pytest + from leanagent_utils import remove_marks diff --git a/unittest_dynamic_database.py b/unittest_dynamic_database.py index 25c9b03..291e2ec 100644 --- a/unittest_dynamic_database.py +++ b/unittest_dynamic_database.py @@ -1,32 +1,24 @@ # import all the necessary modules +import datetime +import json import math -from typing import Union +import os +import random +import shutil import unittest -import datetime from pathlib import Path -from dynamic_database import ( - DynamicDatabase, - Repository, - Theorem, - AnnotatedTactic, - Annotation, - PremiseFile, - Premise, -) -from lean_dojo.data_extraction.lean import Pos, LeanGitRepo -import generate_benchmark_lean4 +from typing import Tuple, Union +from unittest.mock import MagicMock, Mock, patch + import lean_dojo -import json -import shutil -import random +from lean_dojo.data_extraction.lean import LeanGitRepo, Pos from loguru import logger -from unittest.mock import Mock, patch -from dynamic_database import DynamicDatabase, Repository, Theorem, AnnotatedTactic -from prover.proof_search import Status, SearchResult -from dynamic_database import parse_pos -from typing import Tuple -import os -from unittest.mock import patch, MagicMock + +import generate_benchmark_lean4 +from dynamic_database import (AnnotatedTactic, Annotation, DynamicDatabase, + Premise, PremiseFile, Repository, Theorem, + parse_pos) +from prover.proof_search import SearchResult, Status RAID_DIR = os.environ.get("RAID_DIR") DATA_DIR = "datasets_new_unittest" From 93118f45bcea37ff3c1b31508dc102e6a5b7ac06 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 13:29:27 -0700 Subject: [PATCH 04/29] Refactor --- .gitignore | 1 + constants.py | 237 +++++++++ dynamic_database.py | 8 +- generate_benchmark_lean4.py | 2 +- git_utils.py | 547 ++++++++++++++++++++ leanagent.py | 972 ++++++------------------------------ run_leanagent.sh | 10 +- 7 files changed, 937 insertions(+), 840 deletions(-) create mode 100644 constants.py create mode 100644 git_utils.py mode change 100644 => 100755 run_leanagent.sh diff --git a/.gitignore b/.gitignore index 757bfd1..dce68d7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ retrieval/bm25 .idea/ .DS_Store RAID/ +repos # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/constants.py b/constants.py new file mode 100644 index 0000000..01f0ad4 --- /dev/null +++ b/constants.py @@ -0,0 +1,237 @@ +PR_TITLE = "[LeanAgent] Proofs" + +PR_BODY = """ +[LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. 
+ +--- + +~LeanAgent - From the [LeanDojo](https://leandojo.org/) family +""" + +TMP_BRANCH = "_LeanAgent" + +COMMIT_MESSAGE = "[LeanAgent] Proofs" + +# List of known repositories to process or skip +# Feel free to remove any repos from this list if you would like to test on them + +known_repositories = [ + "leanprover-community/mathlib4", # ReProver is trained on this + "leanprover-community/batteries", # functional programming instead of math + "leanprover-community/aesop", + "leanprover/lean4", + "leanprover-community/mathlib", # Mathlib3 version + "leanprover-community/mathlib3", + "leanprover/std4", # moved to batteries + "leanprover-community/duper", # functional programming instead of math + "leanprover/lake", + "openai/lean-gym", + "leanprover-community/lean4-metaprogramming-book", + "kmill/lean4-raytracer", # no theorems + "argumentcomputer/yatima", # trace problems + "ImperialCollegeLondon/formalising-mathematics-2024", # trace problems + "leanprover-community/ProofWidgets4", # trace problems + "leanprover/verso", # trace problems + "leanprover-community/NNG4", # trace problems + "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings + "teorth/symmetric_project", # no compatible commit + "cmu-l3/llmlean", # irrelevant + only 4 theorems + "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps + "avigad/lamr", # trace problems + "leanprover-community/quote4", # no theorems + "leanprover-community/iris-lean", # trace problems + "aripiprazole/rinha", # incompatible commit + "leanprover/lean4-cli", # no theorems + "leanprover/LeanInk", # no theorems + "leanprover-community/lean-auto", + "leanprover-community/repl", # no theorems + "leanprover/doc-gen4", # no theorems + "leanprover/SampCert", # trace problems + "nomeata/loogle", + "risc0/risc0-lean4", + "PatrickMassot/verbose-lean4", # no theorems + "tydeu/lean4-alloy", # no theorems + "leanprover/leansat", # deprecated + "BoltonBailey/formal-snarks-project", # two theorems + "dwrensha/lean4-maze", # two theorems + "leanprover-community/mathport", # irrelevant + "argumentcomputer/LSpec", # one theorem + "reaslab/jixia", # no theorems + "riccardobrasca/flt3", # no theorems + "dwrensha/animate-lean-proofs", # irrelevant + "lean-ja/lean-by-example", # irrelevant + "NethermindEth/Clear", # no theorems + "fgdorais/lean4-parser", # irrelevant + "semorrison/lean-training-data", # irrelevant + "verse-lab/lean-ssr", # irrelevant + "GaloisInc/lean-llvm", # irrelevant + "argumentcomputer/Wasm.lean", # irrelevant + "NethermindEth/EVMYulLean", # irrelevant + "rwbarton/advent-of-lean-4", # irrelevant + "leanprover-community/tutorials4", # irrelevant + "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant + "leanprover/LNSym", + "leanprover-community/flt-regular", + "opencompl/lean-mlir-old", + "rami3l/plfl", + "HEPLean/HepLean", + "forked-from-1kasper/ground_zero", + "verified-optimization/CvxLean", + "leanprover-community/sphere-eversion", + "optsuite/optlib", + "YaelDillies/LeanCamCombi", + "JamesGallicchio/LeanColls", + "T-Brick/c0deine", + "jjdishere/EG", + "alexkeizer/QpfTypes", + "fpvandoorn/LeanCourse23", + "marcusrossel/lean-egg", + "reilabs/proven-zk", + "algebraic-dev/soda", + "leanprover-community/llm", + "dignissimus/Untangle", + "argumentcomputer/Megaparsec.lean", + "emilyriehl/infinity-cosmos", + "BartoszPiotrowski/lean-premise-selection", + "djvelleman/HTPILeanPackage", + "girving/ray", + "Anderssorby/SDL.lean", + "pandaman64/lean-regex", + "brown-cs22/CS22-Lean-2023", + "hhu-adam/GameSkeleton", + 
"FR-vdash-bot/Algorithm", + "PeterKementzey/graph-library-for-lean4", + "arthurpaulino/LeanMySQL", + "arthurpaulino/NumLean", + "FormalSAT/trestle", + "nomeata/lean-wf-induct", + "leanprover/lean4checker", + "IPDSnelting/tba-2022", + "digama0/mm-lean4", + "KislyjKisel/Raylib.lean", + "algebraic-dev/melp", + "hhu-adam/Robo", # same as other tutorials but has lots of sorries + "hargoniX/socket.lean", + "kovach/etch", + "damek/gd-lean", + "0art0/lean-slides", + "forked-from-1kasper/lean4-categories", + "katydid/proofs", + "alexjbest/leaff", + "sinhp/Poly", + "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries + "lean-ja/lean99", + "leanprover/SHerLOC", + "Seasawher/mdgen", + "opencompl/egg-tactic-code", + "david-christiansen/ssft24", + "T-Brick/lean2wasm", + "hargoniX/cpdt-lean", + "jsm28/AperiodicMonotilesLean", + "draperlaboratory/ELFSage", + "rookie-joe/automatic-lean4-compilation", + "madvorak/fecssk", + "david-christiansen/bob24", + "awodey/joyal", + "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries + "paulch42/lean-spec", + "siddhartha-gadgil/MetaExamples", + "dannypsnl/violet", + "arthurpaulino/LeanREPL", + "Kha/do-supplement", + "joehendrix/lean-sat-checker", + "ammkrn/timelib", + "kmill/LeanTeX", + "leanprover/lean4export", + "leanprover-community/mathlib3port", + "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries + "T-Brick/lean-wasm", + "crabbo-rave/Soup", + "argumentcomputer/RustFFI.lean", + "suhr/tmath", + "leanprover/leanbv", + "arthurpaulino/FxyLang", + "SchrodingerZhu/LeanGccBackend", + "lecopivo/lean4-karray", + "ImperialCollegeLondon/M1F-explained", + "proost-assistant/ProostLean", + "DavePearce/LeanEVM", + "algebraic-dev/ash", + "FormalizedFormalLogic/Arithmetization", + "cmu-l3/ntp-toolkit", + "dwrensha/tryAtEachStep", + "yangky11/lean4-example", + "T-Brick/DateTime", + "model-checking/rust-lean-models", + "MichaelStollBayreuth/EulerProducts", + "hargoniX/Flame", + "argumentcomputer/Http.lean", + "madvorak/vcsp", + "teorth/newton", + "apnelson1/Matroid", + "smorel394/TS1", + "ianjauslin-rutgers/pythagoras4", + "mortarsanjaya/IMOSLLean4", + "dupuisf/BibtexQuery", + "nomeata/lean-calcify", + "argumentcomputer/FFaCiL.lean", + "javra/iit", + "arthurpaulino/viper", + "lindy-labs/aegis", + "PatrickMassot/NNG4", + "argumentcomputer/YatimaStdLib.lean", + "fgdorais/lean4-unicode-basic", + "mhuisi/Uniq", + "Kha/macro-supplement", + "chenjulang/rubikcubegroup", + "arthurpaulino/LeanMusic", + "argumentcomputer/Ipld.lean", + "Odomontois/advent2022-lean", + "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries + "ykonstant1/InfinitePrimes", + "alexkassil/natural_number_game_lean4", + "seewoo5/lean-poly-abc", + "rah4927/lean-dojo-mew", + "siddhartha-gadgil/proofs-and-programs-2023", + "PatrickMassot/lean4-game-server", + "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries + "katydid/symbolic-automatic-derivatives", + "girving/interval", + "ImperialCollegeLondon/group-theory-experiments", + "knowsys/CertifyingDatalog", + "bergmannjg/leanCurl", + "vasnesterov/HadwigerNelson", + "FWuermse/lean-postgres", + "leanprover-community/import-graph", + "Human-Oriented-ATP/lean-tactics", # more about tactics than premises + "paulcadman/lean4-leetcode", + "argumentcomputer/Lurk.lean", + "AlexDuchnowski/rubiks-cube", + "SchrodingerZhu/lean-gccjit", + "JamesGallicchio/http", + "jtristan/UnicodeSkipListTableExample", + "adomani/MA4N1_2023", # same as other tutorials but has 
lots of sorries + "remimimimimi/leansec", + "hhu-adam/lean-i18n", + "RemyDegenne/testing-lower-bounds", + "mariainesdff/LocalClassFieldTheory", + "AviCraimer/relational-calculus-library-lean4", + "JLimperg/regensburg-itp-school-2023", + "jaalonso/Calculemus2", + "mseri/BET", + "xubaiw/Reservoir.lean", + "hargoniX/nest-core", + "siddhartha-gadgil/Polylean", + "MichaelStollBayreuth/Weights", + "sanchace/FRACTRAN", + "argumentcomputer/Poseidon.lean", + "madvorak/chomsky", + "T-Brick/ControlFlow", + "pa-ba/guarded-lean", +] + +known_dead_repos = [ + "uwdb/Cosette", + "notepad-plus-plus/userDefinedLanguages", + "teorth/analysis", +] \ No newline at end of file diff --git a/dynamic_database.py b/dynamic_database.py index d3fccc5..4f1b297 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,14 +1,14 @@ from __future__ import annotations - -import datetime +import time +from datetime import datetime import json import os import random import shutil from collections import defaultdict -from dataclasses import asdict, dataclass, field +from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple from lean_dojo.data_extraction.lean import Pos from loguru import logger diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index d343931..67c182c 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -10,7 +10,7 @@ from datetime import datetime from pathlib import Path from typing import Dict, List, Union - +import os import lean_dojo import networkx as nx from lean_dojo import * diff --git a/git_utils.py b/git_utils.py new file mode 100644 index 0000000..6d56fb4 --- /dev/null +++ b/git_utils.py @@ -0,0 +1,547 @@ +import numpy as np +import json +import re +import shutil +import subprocess +import requests +import generate_benchmark_lean4 +from lean_dojo import LeanGitRepo +from datetime import datetime +import lean_dojo +from collections import defaultdict +from dynamic_database import Repository, DynamicDatabase, Theorem + +from loguru import logger +from typing import Union, List, Tuple +import math +import os + +from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE + +personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") +BATCH_SIZE = 4 +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" +repo_dir = f"{RAID_DIR}/repos_new" + +DATA_DIR = f"{RAID_DIR}/data" +CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" +EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" +ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" +FISHER_DIR = f"{RAID_DIR}/fisher" # Optional + + +def clone_repo(repo_url): + """Clone a git repository and return the path to the repository and its sha.""" + # TODO: Fix + repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") + logger.info(f"Cloning {repo_url}") + logger.info(f"Repo name: {repo_name}") + repo_name = os.path.join(repo_dir, repo_name) + if os.path.exists(repo_name): + print(f"Deleting existing repository directory: {repo_name}") + shutil.rmtree(repo_name) + + subprocess.run(["git", "clone", repo_url, repo_name]) + process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) + stdout, _stderr = process.communicate() + sha = re.split(r"\t+", stdout.decode("utf-8"))[0] + return repo_name, sha + + 
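+# Example usage (illustrative only): clone a repository and record the SHA of its
+# remote HEAD. The URL below is just a sample taken from known_repositories in
+# constants.py; RAID_DIR must be set so that repo_dir points at a writable path.
+#
+#     repo_path, sha = clone_repo("https://github.com/yangky11/lean4-example.git")
+#     logger.info(f"Cloned {repo_path} at remote HEAD {sha}")
+
+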
+def branch_exists(repo_name, branch_name): + """Check if a branch exists in a git repository.""" + proc = subprocess.run( + ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ) + branches = proc.stdout.split("\n") + local_branch = branch_name + remote_branch = f"remote/{branch_name}" + return any( + branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) + for branch in branches + ) + + +def create_or_switch_branch(repo_name, branch_name, base_branch): + """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" + if not branch_exists(repo_name, branch_name): + subprocess.run( + ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True + ) + else: + subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) + subprocess.run( + [ + "git", + "-C", + repo_name, + "merge", + base_branch, + "-m", + f"Merging {branch_name} into {base_branch}", + ], + check=True, + ) + + +def commit_changes(repo_name, commit_message): + """Commit changes to a git repository.""" + status = subprocess.run( + ["git", "-C", repo_name, "status", "--porcelain"], + capture_output=True, + text=True, + ).stdout.strip() + if status == "": + print("No changes to commit.") + return False + subprocess.run(["git", "-C", repo_name, "add", "."], check=True) + subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) + return True + + +def push_changes(repo_name, branch_name): + """Push changes to a git repository.""" + subprocess.run( + ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True + ) + + +def get_default_branch(repo_full_name): + """Get the default branch of a repository (default `main`).""" + url = f"https://api.github.com/repos/{repo_full_name}" + headers = { + "Authorization": f"token {personal_access_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get(url, headers=headers) + if response.status_code == 200: + return response.json()["default_branch"] + else: + logger.info(f"Failed to get default branch for {repo_full_name}") + return "main" + + +def create_pull_request(repo_full_name, title, body, head_branch): + """Create a pull request in a repository.""" + base_branch = get_default_branch(repo_full_name) + url = f"https://api.github.com/repos/{repo_full_name}/pulls" + headers = { + "Authorization": f"token {personal_access_token}", + "Accept": "application/vnd.github.v3+json", + } + data = {"title": title, "body": body, "head": head_branch, "base": base_branch} + response = requests.post(url, headers=headers, json=data) + if response.status_code == 201: + print("Pull request created successfully: " + response.json()["html_url"]) + return response.json()["html_url"] + else: + print("Failed to create pull request", response.text) + return "" + +def ensure_inside_git(): + """Ensure that the current directory is inside a git repository.""" + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.")
+        subprocess.run(["git", "init"], check=True)
+
+def get_compatible_commit(url):
+    """Find the most recent commit with a Lean version that LeanAgent supports."""
+    try:
+        process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0]
+        logger.info(f"Latest commit: {latest_commit}")
+
+        new_url = url.replace(".git", "")
+        logger.info(f"Creating LeanGitRepo for {new_url}")
+
+        repo = LeanGitRepo(new_url, latest_commit)
+        logger.info(f"Getting config for {url}")
+
+        config = repo.get_config("lean-toolchain")
+        v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+
+        if generate_benchmark_lean4.is_supported_version(v):
+            logger.info(f"Latest commit compatible for url {url}")
+            return latest_commit, v
+
+        logger.info(f"Searching for compatible commit for {url}")
+
+        ensure_inside_git()
+        process = subprocess.Popen(
+            ["git", "fetch", "--depth=1000000", url],  # Fetch commits
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        logger.info(f"Fetching commits for {url}")
+        _, stderr = process.communicate()
+
+        if process.returncode != 0:
+            raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}")
+
+        logger.info(f"Fetched commits for {url}")
+
+        process = subprocess.Popen(
+            ["git", "log", "--format=%H", "FETCH_HEAD"],  # Get list of commits
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        logger.info(f"Getting list of commits for {url}")
+
+        stdout, stderr = process.communicate()
+        if process.returncode != 0:
+            raise Exception(f"Git log command failed: {stderr.decode('utf-8')}")
+
+        commits = stdout.decode("utf-8").strip().split("\n")
+        logger.info(f"Found {len(commits)} commits for {url}")
+
+        new_url = url.replace(".git", "")
+
+        repo_human_name = "/".join(new_url.split("/")[-2:])
+
+        # Delete repo if it exists, because it might be checked out to a different commit
+        if os.path.exists(os.path.join("repos", repo_human_name)):
+            shutil.rmtree(os.path.join("repos", repo_human_name))
+
+        subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True)
+
+        for commit in commits:
+            logger.info(f"Checking commit {commit} for {url}")
+            # Check out the commit locally
+            subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True)
+            repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit)
+            config = repo.get_config("lean-toolchain")
+            v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+            if generate_benchmark_lean4.is_supported_version(v):
+                logger.info(f"Found compatible commit {commit} for {url}")
+                return commit, v
+
+        raise Exception("No compatible commit found")
+
+    except Exception as e:
+        logger.info(f"Error in get_compatible_commit: {str(e)}")
+        return None, None
+
+
+def find_and_save_compatible_commits(repo_info_file, lean_git_repos):
+    """Finds compatible commits for various repositories"""
+    with open(repo_info_file, "r") as repo_compatibility_file:
+        updated_repos = json.load(repo_compatibility_file)
+
+    for repo in lean_git_repos:
+        url = repo.url
+        if not url.endswith(".git"):
+            url = url + ".git"
+
+        sha = None
+        v = None
+
+        # TODO: Check these
+        if "mathlib4" in url:
+            sha = "2b29e73438e240a427bcecc7c0fe19306beb1310"
+            v = "v4.8.0"
+        elif "SciLean" in url:
+            sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744"
+            v = "v4.7.0"
+        elif "pfr" in 
url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + else: + # Check if it's in any element + for elem in updated_repos: + if url.replace(".git", "") == elem["url"]: + continue + + sha, v = get_compatible_commit(url) + + + # Always write to json, even for null repos + updated_repos.append( + {"url": url.replace(".git", ""), "commit": sha if sha else None, "version": v if v else None} + ) + + if not sha: + logger.info(f"Failed to find a compatible commit for {url}") + + # Write per repo in case of interrupt + with open(repo_info_file, "w") as f: + json.dump(updated_repos, f) + + return updated_repos + + +def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos=10): + """Search for the given number of repositories on GitHub that have the given language.""" + headers = {"Authorization": personal_access_token} + query_params = { + "q": f"language:{language}", + "sort": "stars", + "order": "desc", + "per_page": 100, + } + + cloned_count = 0 + page = 1 + + while cloned_count < num_repos: + query_params["page"] = page + response = requests.get( + "https://api.github.com/search/repositories", + headers=headers, + params=query_params, + ) + + if response.status_code == 200: + repositories = response.json()["items"] + for repo in repositories: + if cloned_count >= num_repos: + break + + repo_full_name = repo["full_name"] + print("\n\n") + logger.info(f"Processing {repo_full_name}") + + + # Skip repos that are already known + if repo_full_name not in known_repositories + known_dead_repos + repos: + print("\n\n") + logger.info(f"Processing new repo: {repo_full_name}") + name = None + try: + clone_url = repo["clone_url"] + repo_name, sha = clone_repo(clone_url) + name = repo_name + url = clone_url.replace(".git", "") + + # TODO: This constructor can be very slow + lean_git_repo = LeanGitRepo(url, sha) + + lean_git_repos.append(lean_git_repo) + repos.append(repo_full_name) + cloned_count += 1 + logger.info(f"Cloned {repo_full_name}") + except Exception as e: + shutil.rmtree(name) + logger.info(f"Failed to clone {repo_full_name} because of {e}") + else: + logger.info( + f"Skipping {repo_full_name} since it is a known repository" + ) + page += 1 + else: + logger.info("Failed to search GitHub", response.status_code) + break + + # Check if we've reached the end of the search results + if len(repositories) < 100: + break + + logger.info(f"Total repositories processed: {cloned_count}") + return lean_git_repos, repos + + +def add_repo_to_database(dynamic_database_json_path, repo, db): + """Adds a repository to the dynamic database.""" + # Prepare the data necessary to add this repo to the dynamic database + url = repo.url + if not url.endswith(".git"): + url = url + ".git" + logger.info(f"\n\nProcessing {url}") + + if "mathlib4" in url: + sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" + v = "v4.8.0" + elif "SciLean" in url: + sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" + v = "v4.7.0" + elif "pfr" in url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + else: + sha, v = get_compatible_commit(url) + + if not sha: + logger.info(f"Failed to find a compatible commit for {url}") + return None + + logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + url = url.replace(".git", "") + repo = LeanGitRepo(url, sha) + dir_name = repo.url.split("/")[-1] + "_" + sha + dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + logger.info(f"Generating benchmark at {dst_dir}") + traced_repo, _, _, total_theorems = 
generate_benchmark_lean4.main(
+        repo.url, sha, dst_dir
+    )
+    if not traced_repo:
+        logger.info(f"Failed to trace {url}")
+        return None
+    if total_theorems < 3 * BATCH_SIZE:  # Should be enough theorems for train/val/test
+        logger.info(f"No theorems found in {url}")
+        return None
+    logger.info(f"Finished generating benchmark at {dst_dir}")
+
+    # Add the new repo to the dynamic database
+    config = repo.get_config("lean-toolchain")
+    v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+    theorems_folder = dst_dir + "/random"
+    premise_files_corpus = dst_dir + "/corpus.jsonl"
+    files_traced = dst_dir + "/traced_files.jsonl"
+    pr_url = None
+    data = {
+        "url": repo.url,
+        "name": "/".join(repo.url.split("/")[-2:]),
+        "commit": repo.commit,
+        "lean_version": v,
+        "lean_dojo_version": lean_dojo.__version__,
+        "metadata": {
+            "date_processed": datetime.now(),
+        },
+        "theorems_folder": theorems_folder,
+        "premise_files_corpus": premise_files_corpus,
+        "files_traced": files_traced,
+        "pr_url": pr_url,
+    }
+
+    repo = Repository.from_dict(data)
+    logger.info("Before adding new repo:")
+    db.print_database_contents()
+    db.add_repository(repo)
+    logger.info("After adding new repo:")
+    db.print_database_contents()
+    db.to_json(dynamic_database_json_path)
+    return "Done"
+
+def calculate_difficulty(theorem: Theorem) -> Union[float, None]:
+    """Calculates the difficulty of a theorem."""
+    proof_steps = theorem.traced_tactics
+    if any("sorry" in step.tactic for step in proof_steps):
+        return float("inf")  # Hard (no proof)
+    if len(proof_steps) == 0:
+        return None  # To be distributed later
+    return math.exp(len(proof_steps))
+
+def categorize_difficulty(
+    difficulty: Union[float, None], percentiles: List[float]
+) -> str:
+    """Categorizes the difficulty of a theorem."""
+    if difficulty is None:
+        return "To_Distribute"
+    if difficulty == float("inf"):
+        return "Hard (No proof)"
+    elif difficulty <= percentiles[0]:
+        return "Easy"
+    elif difficulty <= percentiles[1]:
+        return "Medium"
+    else:
+        return "Hard"
+
+
+def sort_repositories_by_difficulty(db: DynamicDatabase) -> Tuple[List[Repository], dict, np.ndarray]:
+    """Sorts repositories by the difficulty of their theorems."""
+    difficulties_by_repo = defaultdict(list)
+    all_difficulties = []
+
+    print("Ready to calculate difficulties of all theorems")
+    for repo in db.repositories:
+        print(f"Starting {repo.name}")
+        for theorem in repo.get_all_theorems:
+            difficulty = calculate_difficulty(theorem)
+            theorem.difficulty_rating = difficulty
+            difficulties_by_repo[repo].append(
+                (
+                    theorem.full_name,
+                    str(theorem.file_path),
+                    tuple(theorem.start),
+                    tuple(theorem.end),
+                    difficulty,
+                )
+            )
+            if difficulty is not None:
+                all_difficulties.append(difficulty)
+
+        db.update_repository(repo)
+        print(f"Finished {repo.name}")
+
+    percentiles = np.percentile(all_difficulties, [33, 67])
+
+    categorized_theorems = defaultdict(lambda: defaultdict(list))
+
+    print("Ready to categorize theorems")
+    for repo, theorems in difficulties_by_repo.items():
+        print(f"Starting {repo.name}")
+        for theorem_name, file_path, start, end, difficulty in theorems:
+            category = categorize_difficulty(difficulty, percentiles)
+            categorized_theorems[repo][category].append(
+                (theorem_name, file_path, start, end, difficulty)
+            )
+        print(f"Finished {repo.name}")
+
+    print("Distributing theorems with no proofs")
+    for repo in categorized_theorems:
+        print(f"Starting {repo.name}")
+        to_distribute = categorized_theorems[repo]["To_Distribute"]
+        chunk_size = len(to_distribute) // 3
+        for 
i, category in enumerate(["Easy", "Medium", "Hard"]): + start = i * chunk_size + end = start + chunk_size if i < 2 else None + categorized_theorems[repo][category].extend(to_distribute[start:end]) + del categorized_theorems[repo]["To_Distribute"] + print(f"Finished {repo.name}") + + # Sort repositories based on the number of easy theorems + sorted_repos = sorted( + categorized_theorems.keys(), + key=lambda r: len(categorized_theorems[r]["Easy"]), + reverse=True, + ) + + return sorted_repos, categorized_theorems, percentiles + + +def save_sorted_repos(sorted_repos: List[Repository], file_path: str): + """Saves the sorted repositories to a file.""" + sorted_repo_data = [ + {"url": repo.url, "commit": repo.commit, "name": repo.name} + for repo in sorted_repos + ] + with open(file_path, "w") as f: + json.dump(sorted_repo_data, f, indent=2) + + +def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: + """Loads the sorted repositories from a file.""" + with open(file_path, "r") as f: + sorted_repo_data = json.load(f) + return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + + +def write_skip_file(repo_url): + """Writes a repository URL to a file to skip it.""" + skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + with open(skip_file_path, "w") as f: + f.write(repo_url) + + +def should_skip_repo(): + """Checks if a repository should be skipped.""" + skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + if os.path.exists(skip_file_path): + with open(skip_file_path, "r") as f: + repo_url = f.read().strip() + return True, repo_url + return False, None \ No newline at end of file diff --git a/leanagent.py b/leanagent.py index 59d2724..68a3e4d 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,47 +1,45 @@ -# import all the necessary libraries import json -import math import os import pickle import random -import re -import shutil -import subprocess + import sys import time import traceback -from collections import defaultdict -from copy import copy + from datetime import datetime, timedelta from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple + -import lean_dojo import numpy as np import pytorch_lightning as pl import ray -import requests + import torch from lean_dojo import * from lean_dojo import LeanGitRepo, Pos from lean_dojo import Theorem from lean_dojo import Theorem as LeanDojoTheorem -from lean_dojo import is_available_in_cache + from loguru import logger from pytorch_lightning import seed_everything -from pytorch_lightning.callbacks import (Callback, EarlyStopping, - LearningRateMonitor, ModelCheckpoint) +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint from pytorch_lightning.strategies import DDPStrategy from tqdm import tqdm import generate_benchmark_lean4 -from dynamic_database import * +from dynamic_database import AnnotatedTactic, Theorem, DynamicDatabase from prover.proof_search import DistributedProver, SearchResult, Status from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli from retrieval.model import PremiseRetriever +from git_utils import find_and_save_compatible_commits, search_github_repositories, should_skip_repo, add_repo_to_database, sort_repositories_by_difficulty, save_sorted_repos + # Set the seed for reproducibility +personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") + random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 RAID_DIR 
= os.environ.get("RAID_DIR") @@ -58,512 +56,8 @@ repos_for_merged_dataset = [] repos_for_proving = [] - -# List of known repositories to process or skip -# Feel free to remove any repos from this list if you would like to test on them -known_repositories = [ - "leanprover-community/mathlib4", # ReProver is trained on this - "leanprover-community/batteries", # functional programming instead of math - "leanprover-community/aesop", - "leanprover/lean4", - "leanprover-community/mathlib", # Mathlib3 version - "leanprover-community/mathlib3", - "leanprover/std4", # moved to batteries - "leanprover-community/duper", # functional programming instead of math - "leanprover/lake", - "openai/lean-gym", - "leanprover-community/lean4-metaprogramming-book", - "kmill/lean4-raytracer", # no theorems - "argumentcomputer/yatima", # trace problems - "ImperialCollegeLondon/formalising-mathematics-2024", # trace problems - "leanprover-community/ProofWidgets4", # trace problems - "leanprover/verso", # trace problems - "leanprover-community/NNG4", # trace problems - "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings - "teorth/symmetric_project", # no compatible commit - "cmu-l3/llmlean", # irrelevant + only 4 theorems - "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps - "avigad/lamr", # trace problems - "leanprover-community/quote4", # no theorems - "leanprover-community/iris-lean", # trace problems - "aripiprazole/rinha", # incompatible commit - "leanprover/lean4-cli", # no theorems - "leanprover/LeanInk", # no theorems - "leanprover-community/lean-auto", - "leanprover-community/repl", # no theorems - "leanprover/doc-gen4", # no theorems - "leanprover/SampCert", # trace problems - "nomeata/loogle", - "risc0/risc0-lean4", - "PatrickMassot/verbose-lean4", # no theorems - "tydeu/lean4-alloy", # no theorems - "leanprover/leansat", # deprecated - "BoltonBailey/formal-snarks-project", # two theorems - "dwrensha/lean4-maze", # two theorems - "leanprover-community/mathport", # irrelevant - "argumentcomputer/LSpec", # one theorem - "reaslab/jixia", # no theorems - "riccardobrasca/flt3", # no theorems - "dwrensha/animate-lean-proofs", # irrelevant - "lean-ja/lean-by-example", # irrelevant - "NethermindEth/Clear", # no theorems - "fgdorais/lean4-parser", # irrelevant - "semorrison/lean-training-data", # irrelevant - "verse-lab/lean-ssr", # irrelevant - "GaloisInc/lean-llvm", # irrelevant - "argumentcomputer/Wasm.lean", # irrelevant - "NethermindEth/EVMYulLean", # irrelevant - "rwbarton/advent-of-lean-4", # irrelevant - "leanprover-community/tutorials4", # irrelevant - "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant - "leanprover/LNSym", - "leanprover-community/flt-regular", - "opencompl/lean-mlir-old", - "rami3l/plfl", - "HEPLean/HepLean", - "forked-from-1kasper/ground_zero", - "verified-optimization/CvxLean", - "leanprover-community/sphere-eversion", - "optsuite/optlib", - "YaelDillies/LeanCamCombi", - "JamesGallicchio/LeanColls", - "T-Brick/c0deine", - "jjdishere/EG", - "alexkeizer/QpfTypes", - "fpvandoorn/LeanCourse23", - "marcusrossel/lean-egg", - "reilabs/proven-zk", - "algebraic-dev/soda", - "leanprover-community/llm", - "dignissimus/Untangle", - "argumentcomputer/Megaparsec.lean", - "emilyriehl/infinity-cosmos", - "BartoszPiotrowski/lean-premise-selection", - "djvelleman/HTPILeanPackage", - "girving/ray", - "Anderssorby/SDL.lean", - "pandaman64/lean-regex", - "brown-cs22/CS22-Lean-2023", - "hhu-adam/GameSkeleton", - "FR-vdash-bot/Algorithm", - 
"PeterKementzey/graph-library-for-lean4", - "arthurpaulino/LeanMySQL", - "arthurpaulino/NumLean", - "FormalSAT/trestle", - "nomeata/lean-wf-induct", - "leanprover/lean4checker", - "IPDSnelting/tba-2022", - "digama0/mm-lean4", - "KislyjKisel/Raylib.lean", - "algebraic-dev/melp", - "hhu-adam/Robo", # same as other tutorials but has lots of sorries - "hargoniX/socket.lean", - "kovach/etch", - "damek/gd-lean", - "0art0/lean-slides", - "forked-from-1kasper/lean4-categories", - "katydid/proofs", - "alexjbest/leaff", - "sinhp/Poly", - "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries - "lean-ja/lean99", - "leanprover/SHerLOC", - "Seasawher/mdgen", - "opencompl/egg-tactic-code", - "david-christiansen/ssft24", - "T-Brick/lean2wasm", - "hargoniX/cpdt-lean", - "jsm28/AperiodicMonotilesLean", - "draperlaboratory/ELFSage", - "rookie-joe/automatic-lean4-compilation", - "madvorak/fecssk", - "david-christiansen/bob24", - "awodey/joyal", - "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries - "paulch42/lean-spec", - "siddhartha-gadgil/MetaExamples", - "dannypsnl/violet", - "arthurpaulino/LeanREPL", - "Kha/do-supplement", - "joehendrix/lean-sat-checker", - "ammkrn/timelib", - "kmill/LeanTeX", - "leanprover/lean4export", - "leanprover-community/mathlib3port", - "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries - "T-Brick/lean-wasm", - "crabbo-rave/Soup", - "argumentcomputer/RustFFI.lean", - "suhr/tmath", - "leanprover/leanbv", - "arthurpaulino/FxyLang", - "SchrodingerZhu/LeanGccBackend", - "lecopivo/lean4-karray", - "ImperialCollegeLondon/M1F-explained", - "proost-assistant/ProostLean", - "DavePearce/LeanEVM", - "algebraic-dev/ash", - "FormalizedFormalLogic/Arithmetization", - "cmu-l3/ntp-toolkit", - "dwrensha/tryAtEachStep", - "yangky11/lean4-example", - "T-Brick/DateTime", - "model-checking/rust-lean-models", - "MichaelStollBayreuth/EulerProducts", - "hargoniX/Flame", - "argumentcomputer/Http.lean", - "madvorak/vcsp", - "teorth/newton", - "apnelson1/Matroid", - "smorel394/TS1", - "ianjauslin-rutgers/pythagoras4", - "mortarsanjaya/IMOSLLean4", - "dupuisf/BibtexQuery", - "nomeata/lean-calcify", - "argumentcomputer/FFaCiL.lean", - "javra/iit", - "arthurpaulino/viper", - "lindy-labs/aegis", - "PatrickMassot/NNG4", - "argumentcomputer/YatimaStdLib.lean", - "fgdorais/lean4-unicode-basic", - "mhuisi/Uniq", - "Kha/macro-supplement", - "chenjulang/rubikcubegroup", - "arthurpaulino/LeanMusic", - "argumentcomputer/Ipld.lean", - "Odomontois/advent2022-lean", - "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries - "ykonstant1/InfinitePrimes", - "alexkassil/natural_number_game_lean4", - "seewoo5/lean-poly-abc", - "rah4927/lean-dojo-mew", - "siddhartha-gadgil/proofs-and-programs-2023", - "PatrickMassot/lean4-game-server", - "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries - "katydid/symbolic-automatic-derivatives", - "girving/interval", - "ImperialCollegeLondon/group-theory-experiments", - "knowsys/CertifyingDatalog", - "bergmannjg/leanCurl", - "vasnesterov/HadwigerNelson", - "FWuermse/lean-postgres", - "leanprover-community/import-graph", - "Human-Oriented-ATP/lean-tactics", # more about tactics than premises - "paulcadman/lean4-leetcode", - "argumentcomputer/Lurk.lean", - "AlexDuchnowski/rubiks-cube", - "SchrodingerZhu/lean-gccjit", - "JamesGallicchio/http", - "jtristan/UnicodeSkipListTableExample", - "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries - 
"remimimimimi/leansec", - "hhu-adam/lean-i18n", - "RemyDegenne/testing-lower-bounds", - "mariainesdff/LocalClassFieldTheory", - "AviCraimer/relational-calculus-library-lean4", - "JLimperg/regensburg-itp-school-2023", - "jaalonso/Calculemus2", - "mseri/BET", - "xubaiw/Reservoir.lean", - "hargoniX/nest-core", - "siddhartha-gadgil/Polylean", - "MichaelStollBayreuth/Weights", - "sanchace/FRACTRAN", - "argumentcomputer/Poseidon.lean", - "madvorak/chomsky", - "T-Brick/ControlFlow", - "pa-ba/guarded-lean", -] - -repos = [] lean_git_repos = [] -personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") - -PR_TITLE = "[LeanAgent] Proofs" - -PR_BODY = """ -[LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. - ---- - -~LeanAgent - From the [LeanDojo](https://leandojo.org/) family -""" - -TMP_BRANCH = "_LeanAgent" - -COMMIT_MESSAGE = "[LeanAgent] Proofs" - - -def clone_repo(repo_url): - """Clone a git repository and return the path to the repository and its sha.""" - repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") - logger.info(f"Cloning {repo_url}") - logger.info(f"Repo name: {repo_name}") - repo_name = repo_dir + "/" + repo_name - if os.path.exists(repo_name): - print(f"Deleting existing repository directory: {repo_name}") - shutil.rmtree(repo_name) - subprocess.run(["git", "clone", repo_url, repo_name]) - process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - sha = re.split(r"\t+", stdout.decode("utf-8"))[0] - return repo_name, sha - - -def branch_exists(repo_name, branch_name): - """Check if a branch exists in a git repository.""" - proc = subprocess.run( - ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True - ) - branches = proc.stdout.split("\n") - local_branch = branch_name - remote_branch = f"remote/{branch_name}" - return any( - branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) - for branch in branches - ) - - -def create_or_switch_branch(repo_name, branch_name, base_branch): - """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" - if not branch_exists(repo_name, branch_name): - subprocess.run( - ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True - ) - else: - subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) - subprocess.run( - [ - "git", - "-C", - repo_name, - "merge", - base_branch, - "-m", - f"Merging {branch_name} into {base_branch}", - ], - check=True, - ) - - -def commit_changes(repo_name, commit_message): - """Commit changes to a git repository.""" - status = subprocess.run( - ["git", "-C", repo_name, "status", "--porcelain"], - capture_output=True, - text=True, - ).stdout.strip() - if status == "": - print("No changes to commit.") - return False - subprocess.run(["git", "-C", repo_name, "add", "."], check=True) - subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) - return True - - -def push_changes(repo_name, branch_name): - """Push changes to a git repository.""" - subprocess.run( - ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True - ) - - -def get_default_branch(repo_full_name): - """Get the default branch of a repository (default `main`).""" - url = f"https://api.github.com/repos/{repo_full_name}" - headers = { - "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get(url, 
headers=headers) - if response.status_code == 200: - return response.json()["default_branch"] - else: - logger.info(f"Failed to get default branch for {repo_full_name}") - return "main" - - -def create_pull_request(repo_full_name, title, body, head_branch): - """Create a pull request in a repository.""" - base_branch = get_default_branch(repo_full_name) - url = f"https://api.github.com/repos/{repo_full_name}/pulls" - headers = { - "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json", - } - data = {"title": title, "body": body, "head": head_branch, "base": base_branch} - response = requests.post(url, headers=headers, json=data) - if response.status_code == 201: - print("Pull request created successfully: " + response.json()["html_url"]) - return response.json()["html_url"] - else: - print("Failed to create pull request", response.text) - return "" - - -def get_compatible_commit(url): - """Find the most recent commit with a Lean version that LeanAgent supports.""" - try: - process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] - logger.info(f"Latest commit: {latest_commit}") - - new_url = url.replace(".git", "") - logger.info(f"Creating LeanGitRepo for {new_url}") - repo = LeanGitRepo(new_url, latest_commit) - logger.info(f"Getting config for {url}") - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Latest commit compatible for url {url}") - return latest_commit, v - - logger.info(f"Searching for compatible commit for {url}") - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. 
Initializing one.") - subprocess.run(["git", "init"], check=True) - - process = subprocess.Popen( - ["git", "fetch", "--depth=1000000", url], # Fetch commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Fetching commits for {url}") - _, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") - logger.info(f"Fetched commits for {url}") - process = subprocess.Popen( - ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Getting list of commits for {url}") - stdout, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - commits = stdout.decode("utf-8").strip().split("\n") - logger.info(f"Found {len(commits)} commits for {url}") - for commit in commits: - new_url = url.replace(".git", "") - repo = LeanGitRepo(new_url, commit) - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config( - config["content"] - ) - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Found compatible commit {commit} for {url}") - return commit, v - - raise Exception("No compatible commit found") - - except Exception as e: - logger.info(f"Error in get_compatible_commit: {str(e)}") - return None, None - - -def find_and_save_compatible_commits(repo_info_file, lean_git_repos): - """Finds compatible commits for various repositories""" - updated_repos = [] - for repo in lean_git_repos: - url = repo.url - if not url.endswith(".git"): - url = url + ".git" - - sha = None - v = None - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") - continue - - updated_repos.append( - {"url": url.replace(".git", ""), "commit": sha, "version": v} - ) - - with open(repo_info_file, "w") as f: - json.dump(updated_repos, f) - - return updated_repos - - -def search_github_repositories(language="Lean", num_repos=10): - """Search for the given number of repositories on GitHub that have the given language.""" - headers = {"Authorization": personal_access_token} - query_params = { - "q": f"language:{language}", - "sort": "stars", - "order": "desc", - "per_page": 100, - } - - cloned_count = 0 - page = 1 - - while cloned_count < num_repos: - query_params["page"] = page - response = requests.get( - "https://api.github.com/search/repositories", - headers=headers, - params=query_params, - ) - - if response.status_code == 200: - repositories = response.json()["items"] - for repo in repositories: - if cloned_count >= num_repos: - break - repo_full_name = repo["full_name"] - logger.info(f"Processing {repo_full_name}") - if repo_full_name not in known_repositories: - name = None - try: - clone_url = repo["clone_url"] - repo_name, sha = clone_repo(clone_url) - name = repo_name - url = clone_url.replace(".git", "") - lean_git_repo = LeanGitRepo(url, sha) - lean_git_repos.append(lean_git_repo) - repos.append(repo_full_name) - cloned_count += 1 - logger.info(f"Cloned {repo_full_name}") - except Exception as e: - shutil.rmtree(name) - logger.info(f"Failed to clone {repo_full_name} because of {e}") 
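The clone loop above sits inside a paginated GitHub search: 100 results per page, sorted by stars, stopping once enough new repositories have been cloned or the results run out. A condensed sketch of just the pagination, with the token handling and the skip-list check left out:

```python
import requests


def search_lean_repos(max_repos: int, language: str = "Lean", headers: dict = None):
    """Yield up to max_repos repository dicts from the GitHub search API, most-starred first."""
    page, found = 1, 0
    while found < max_repos:
        response = requests.get(
            "https://api.github.com/search/repositories",
            headers=headers or {},
            params={"q": f"language:{language}", "sort": "stars",
                    "order": "desc", "per_page": 100, "page": page},
        )
        if response.status_code != 200:
            break
        items = response.json()["items"]
        for item in items:
            if found >= max_repos:
                return
            yield item
            found += 1
        if len(items) < 100:  # reached the last page of search results
            return
        page += 1
```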
- else: - logger.info( - f"Skipping {repo_full_name} since it is a known repository" - ) - page += 1 - else: - logger.info("Failed to search GitHub", response.status_code) - break - - # Check if we've reached the end of the search results - if len(repositories) < 100: - break - - logger.info(f"Total repositories processed: {cloned_count}") +repos = [] def _eval(data, preds_map) -> Tuple[float, float, float]: @@ -839,82 +333,7 @@ def prove_sorry_theorems( save_progress(all_encountered_theorems) logger.info("Finished attempting to prove sorry theorems") - - -def add_repo_to_database(dynamic_database_json_path, repo, db): - """Adds a repository to the dynamic database.""" - # Prepare the data necessary to add this repo to the dynamic database - url = repo.url - if not url.endswith(".git"): - url = url + ".git" - logger.info(f"Processing {url}") - - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) - - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") - return None - - logger.info(f"Found compatible commit {sha} for {url}") - logger.info(f"Lean version: {v}") - url = url.replace(".git", "") - repo = LeanGitRepo(url, sha) - dir_name = repo.url.split("/")[-1] + "_" + sha - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name - logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( - repo.url, sha, dst_dir - ) - if not traced_repo: - logger.info(f"Failed to trace {url}") - return None - if total_theorems < 3 * BATCH_SIZE: # Should be enough theorems for train/val/test - logger.info(f"No theorems found in {url}") - return None - logger.info(f"Finished generating benchmark at {dst_dir}") - - # Add the new repo to the dynamic database - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = dst_dir + "/random" - premise_files_corpus = dst_dir + "/corpus.jsonl" - files_traced = dst_dir + "/traced_files.jsonl" - pr_url = None - data = { - "url": repo.url, - "name": "/".join(repo.url.split("/")[-2:]), - "commit": repo.commit, - "lean_version": v, - "lean_dojo_version": lean_dojo.__version__, - "metadata": { - "date_processed": datetime.datetime.now(), - }, - "theorems_folder": theorems_folder, - "premise_files_corpus": premise_files_corpus, - "files_traced": files_traced, - "pr_url": pr_url, - } - - repo = Repository.from_dict(data) - logger.info("Before adding new repo:") - db.print_database_contents() - db.add_repository(repo) - logger.info("After adding new repo:") - db.print_database_contents() - db.to_json(dynamic_database_json_path) - return "Done" - - + def replace_sorry_with_proof(proofs): """Replace the `sorry` with the proof text in the Lean files.""" logger.info(f"Replacing sorries with {len(proofs)} proofs!") @@ -945,128 +364,133 @@ def replace_sorry_with_proof(proofs): logger.info("Finished replacing sorries with proofs!") - -def calculate_difficulty(theorem: Theorem) -> Union[float, None]: - """Calculates the difficulty of a theorem.""" - proof_steps = theorem.traced_tactics - if any("sorry" in step.tactic for step in proof_steps): - return float("inf") # Hard (no proof) - if len(proof_steps) == 0: - return None # To be distributed later - return 
math.exp(len(proof_steps)) - - -def categorize_difficulty( - difficulty: Union[float, None], percentiles: List[float] -) -> str: - """Categorizes the difficulty of a theorem.""" - if difficulty is None: - return "To_Distribute" - if difficulty == float("inf"): - return "Hard (No proof)" - elif difficulty <= percentiles[0]: - return "Easy" - elif difficulty <= percentiles[1]: - return "Medium" - else: - return "Hard" - - -def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: - """Sorts repositories by the difficulty of their theorems.""" - difficulties_by_repo = defaultdict(list) - all_difficulties = [] - - print("Ready to calculate difficulties of all theorems") - for repo in db.repositories: - print(f"Starting {repo.name}") - for theorem in repo.get_all_theorems: - difficulty = calculate_difficulty(theorem) - theorem.difficulty_rating = difficulty - difficulties_by_repo[repo].append( - ( - theorem.full_name, - str(theorem.file_path), - tuple(theorem.start), - tuple(theorem.end), - difficulty, - ) +def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: + """Initializes or loads the dynamic database.""" + # Check if the current process is the main one + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + + # Initialize the database if it doesn't exist or is empty + if is_main_process: + logger.info("Starting the main process") + if ( + not os.path.exists(dynamic_database_json_path) + or os.path.getsize(dynamic_database_json_path) == 0 + ): + # File doesn't exist or is empty, initialize it + logger.info( + f"\nInitializing new database at {dynamic_database_json_path}\n" ) - if difficulty is not None: - all_difficulties.append(difficulty) - - db.update_repository(repo) - print(f"Finished {repo.name}") - - percentiles = np.percentile(all_difficulties, [33, 67]) - - categorized_theorems = defaultdict(lambda: defaultdict(list)) + db = DynamicDatabase() + db.to_json(dynamic_database_json_path) + else: + try: + logger.info(f"Loading database from {dynamic_database_json_path}") + db = DynamicDatabase.from_json(dynamic_database_json_path) + logger.info(f"Loaded database from {dynamic_database_json_path}") + except json.JSONDecodeError: + # If there's an error decoding the JSON, initialize a new database + logger.warning( + f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." 
+ ) + db = DynamicDatabase() + db.to_json(dynamic_database_json_path) + + return db - print("Ready to categorize theorems") - for repo, theorems in difficulties_by_repo.items(): - print(f"Starting {repo.name}") - for theorem_name, file_path, start, end, difficulty in theorems: - category = categorize_difficulty(difficulty, percentiles) - categorized_theorems[repo][category].append( - (theorem_name, file_path, start, end, difficulty) +def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_path: str, db: DynamicDatabase): + global lean_git_repos + global repos + # If curriculum learning is enabled, initialize repositories and sort them by difficulty + repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + # Check if the current process is the main one + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + if curriculum_learning: + logger.info("Starting curriculum learning") + if is_main_process: + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) + + for i in range(len(lean_git_repos)): + repo = lean_git_repos[i] + print("\n\n") + logger.info(f"Processing new repo: {repo.url}") + result = add_repo_to_database(dynamic_database_json_path, repo, db) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + logger.info( + f"Successfully added {num_repos} repositories to the database" ) - print(f"Finished {repo.name}") - - print("Distributed theorems with no proofs") - for repo in categorized_theorems: - print(f"Starting {repo.name}") - to_distribute = categorized_theorems[repo]["To_Distribute"] - chunk_size = len(to_distribute) // 3 - for i, category in enumerate(["Easy", "Medium", "Hard"]): - start = i * chunk_size - end = start + chunk_size if i < 2 else None - categorized_theorems[repo][category].extend(to_distribute[start:end]) - del categorized_theorems[repo]["To_Distribute"] - print(f"Finished {repo.name}") - - # Sort repositories based on the number of easy theorems - sorted_repos = sorted( - categorized_theorems.keys(), - key=lambda r: len(categorized_theorems[r]["Easy"]), - reverse=True, - ) - - return sorted_repos, categorized_theorems, percentiles - - -def save_sorted_repos(sorted_repos: List[Repository], file_path: str): - """Saves the sorted repositories to a file.""" - sorted_repo_data = [ - {"url": repo.url, "commit": repo.commit, "name": repo.name} - for repo in sorted_repos - ] - with open(file_path, "w") as f: - json.dump(sorted_repo_data, f, indent=2) - - -def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: - """Loads the sorted repositories from a file.""" - with open(file_path, "r") as f: - sorted_repo_data = json.load(f) - return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + sorted_repos, categorized_theorems, percentiles = ( + sort_repositories_by_difficulty(db) + ) + + print("Sorted repositories. 
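The ordering being saved above rates each theorem as exp(number of traced tactic steps), marks proofs that still contain sorry as infinitely hard, leaves theorems with no proof unrated, and then buckets everything at the 33rd and 67th percentiles of the rated values. A compressed sketch of that scheme; the step counts at the bottom are illustrative, not real traced data:

```python
import math

import numpy as np


def difficulty(num_tactics: int, has_sorry: bool):
    """exp(#steps); infinite if the proof still contains sorry; None if there is no proof yet."""
    if has_sorry:
        return float("inf")
    if num_tactics == 0:
        return None
    return math.exp(num_tactics)


def bucket(d, easy_cut, medium_cut):
    if d is None:
        return "To_Distribute"   # split evenly across Easy/Medium/Hard afterwards
    if d == float("inf"):
        return "Hard (No proof)"
    if d <= easy_cut:
        return "Easy"
    if d <= medium_cut:
        return "Medium"
    return "Hard"


scores = [difficulty(n, False) for n in (1, 2, 3, 5, 8)]   # illustrative step counts
easy_cut, medium_cut = np.percentile(scores, [33, 67])
print(bucket(scores[0], easy_cut, medium_cut))             # -> "Easy"
```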
Saving now...") + db.to_json(dynamic_database_json_path) + save_sorted_repos(sorted_repos, "sorted_repos.json") + + print("Summary of theorem difficulties by URL:") + for repo in sorted_repos: + print(f"\nURL: {repo.url}") + for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: + theorems = categorized_theorems[repo][category] + print(f" {category}: {len(theorems)} theorems") + if theorems: + sorted_theorems = sorted( + theorems, + key=lambda x: ( + x[2] if x[2] is not None else -float("inf") + ), + reverse=True, + )[:3] + for name, path, _start, _end, diff in sorted_theorems: + diff_str = f"{diff:.2f}" if diff is not None else "N/A" + print( + f" - {name} (File: {path}, Difficulty: {diff_str})" + ) -def write_skip_file(repo_url): - """Writes a repository URL to a file to skip it.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - with open(skip_file_path, "w") as f: - f.write(repo_url) + print("\nOverall Statistics:") + total_theorems = sum( + len(theorems) + for categories in categorized_theorems.values() + for theorems in categories.values() + ) + for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: + count = sum( + len(categories[category]) + for categories in categorized_theorems.values() + ) + percentage = (count / total_theorems) * 100 + print(f"{category}: {count} theorems ({percentage:.2f}%)") + print( + f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" + ) -def should_skip_repo(): - """Checks if a repository should be skipped.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - if os.path.exists(skip_file_path): - with open(skip_file_path, "r") as f: - repo_url = f.read().strip() - return True, repo_url - return False, None + logger.info("Finding compatible repositories...") + updated_repos = find_and_save_compatible_commits(repo_info_file, sorted_repos) + lean_git_repos = [LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos] + logger.info("Finished finding compatible repositories") + else: + logger.info("Starting without curriculum learning") + if is_main_process: + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "lean", num_repos) + for i in range(len(lean_git_repos)): + repo = lean_git_repos[i] + logger.info(f"Processing {repo.url}") + result = add_repo_to_database(dynamic_database_json_path, repo, db) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + logger.info(f"Successfully added {num_repos} repositories to the database") + + logger.info("Finding compatible repositories...") + updated_repos = find_and_save_compatible_commits(repo_info_file, lean_git_repos) + lean_git_repos = [LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos] + logger.info("Finished finding compatible repositories") + + return lean_git_repos, repos, updated_repos def main(): """ @@ -1075,6 +499,7 @@ def main(): global repos_for_merged_dataset global repos_for_proving global lean_git_repos + global repos try: current_epoch = 0 epochs_per_repo = 1 @@ -1083,7 +508,7 @@ def main(): single_repo = True curriculum_learning = True num_repos = 1 - dynamic_database_json_path = RAID_DIR + "/" + DB_FILE_NAME + dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None if run_progressive_training: @@ -1098,131 +523,14 @@ def main(): generate_benchmark_lean4.configure_leandojo() logger.info("LeanDojo configured") - # Check if the current process is the main one - is_main_process = 
int(os.environ.get("LOCAL_RANK", "0")) == 0 - - # Initialize the database if it doesn't exist or is empty - if is_main_process: - logger.info("Starting the main process") - if ( - not os.path.exists(dynamic_database_json_path) - or os.path.getsize(dynamic_database_json_path) == 0 - ): - # File doesn't exist or is empty, initialize it - logger.info( - f"Initializing new database at {dynamic_database_json_path}" - ) - db = DynamicDatabase() - db.to_json(dynamic_database_json_path) - else: - try: - logger.info(f"Loading database from {dynamic_database_json_path}") - db = DynamicDatabase.from_json(dynamic_database_json_path) - logger.info(f"Loaded database from {dynamic_database_json_path}") - except json.JSONDecodeError: - # If there's an error decoding the JSON, initialize a new database - logger.warning( - f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." - ) - db = DynamicDatabase() - db.to_json(dynamic_database_json_path) - + db = initialize_database(dynamic_database_json_path) logger.info(f"Found {num_repos} repositories") - # If curriculum learning is enabled, initialize repositories and sort them by difficulty - if curriculum_learning: - logger.info("Starting curriculum learning") - repo_info_file = f"{RAID_DIR}/{DATA_DIR}/repo_info_compatible.json" - if is_main_process: - search_github_repositories("Lean", num_repos) - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - logger.info(f"Processing {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info( - f"Successfully added {num_repos} repositories to the database" - ) - - sorted_repos, categorized_theorems, percentiles = ( - sort_repositories_by_difficulty(db) - ) - print("Sorted repositories. 
Saving now...") - db.to_json(dynamic_database_json_path) - save_sorted_repos(sorted_repos, "sorted_repos.json") - print("Summary of theorem difficulties by URL:") - for repo in sorted_repos: - print(f"\nURL: {repo.url}") - for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - theorems = categorized_theorems[repo][category] - print(f" {category}: {len(theorems)} theorems") - if theorems: - sorted_theorems = sorted( - theorems, - key=lambda x: ( - x[2] if x[2] is not None else -float("inf") - ), - reverse=True, - )[:3] - for name, path, start, end, diff in sorted_theorems: - diff_str = f"{diff:.2f}" if diff is not None else "N/A" - print( - f" - {name} (File: {path}, Difficulty: {diff_str})" - ) - - print("\nOverall Statistics:") - total_theorems = sum( - len(theorems) - for categories in categorized_theorems.values() - for theorems in categories.values() - ) - for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - count = sum( - len(categories[category]) - for categories in categorized_theorems.values() - ) - percentage = (count / total_theorems) * 100 - print(f"{category}: {count} theorems ({percentage:.2f}%)") - - print( - f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" - ) - - logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits( - repo_info_file, sorted_repos - ) - lean_git_repos = [ - LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos - ] - logger.info("Finished finding compatible repositories") - else: - logger.info("Starting without curriculum learning") - repo_info_file = f"{RAID_DIR}/{DATA_DIR}/repo_info_compatible.json" - if is_main_process: - search_github_repositories("Lean", num_repos) - - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - logger.info(f"Processing {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info( - f"Successfully added {num_repos} repositories to the database" - ) - - logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits( - repo_info_file, lean_git_repos - ) - lean_git_repos = [ - LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos - ] - logger.info("Finished finding compatible repositories") + lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) + repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") # All processes wait for the file to be created and then read from it + # TODO: Fix with a semaphore or file lock max_attempts = 30 for attempt in range(max_attempts): try: @@ -1242,6 +550,8 @@ def main(): for info in repo_info ] + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + # Iterate over each repository and lambda value for i in range(num_repos): for lambda_value in lambdas: diff --git a/run_leanagent.sh b/run_leanagent.sh old mode 100644 new mode 100755 index 180ff70..e127b50 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -26,15 +26,17 @@ #!/bin/bash export RAID_DIR="~/Desktop/LeanAgent/RAID/" export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" +export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" +export RAY_TMPDIR="${RAID_DIR}/tmp" +export CONDA_SH="/Users/motiwari/miniforge3/etc/profile.d/conda.sh" +source ${CONDA_SH} + cd ${LEAN_AGENT_DIR} echo "Script 
executed from: ${PWD}" -source /Users/motiwari/miniforge3/etc/profile.d/conda.sh conda activate LeanAgent -export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" -export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" echo "Removing old cache files" rm -rf /tmp/ray -export RAY_TMPDIR="${RAID_DIR}/tmp" rm -rf ${RAY_TMPDIR} mkdir "${RAY_TMPDIR}" echo "Stopping ray" From ef397bef8acfdf9086d8411e268c822beaebcca1 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 14:07:53 -0700 Subject: [PATCH 05/29] Querying commit hashes --- git_utils.py | 56 +++++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/git_utils.py b/git_utils.py index 6d56fb4..5cd4630 100644 --- a/git_utils.py +++ b/git_utils.py @@ -16,7 +16,7 @@ import math import os -from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE +from constants import known_repositories, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 @@ -141,22 +141,10 @@ def create_pull_request(repo_full_name, title, body, head_branch): print("Failed to create pull request", response.text) return "" -def ensure_inside_git(): - """Ensure that the current directory is inside a git repository.""" - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. Initializing one.") - subprocess.run(["git", "init"], check=True) - + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" + import ipdb; ipdb.set_trace() try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() @@ -177,9 +165,18 @@ def get_compatible_commit(url): return latest_commit, v logger.info(f"Searching for compatible commit for {url}") - - ensure_inside_git() - ZZ + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.") + subprocess.run(["git", "init"], check=True) + process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, @@ -209,21 +206,10 @@ def get_compatible_commit(url): logger.info(f"Found {len(commits)} commits for {url}") new_url = url.replace(".git", "") - - repo_human_name = "/".join(new_url.split("/")[-2:]) - - # Delete repo if it exists, because it might be checked out to a different commit - if os.path.exists(os.path.join("repos", repo_human_name)): - shutil.rmtree(os.path.join("repos", repo_human_name)) - - subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) - for commit in commits: - logger.info(f"Checking commit {commit} for {url}") - # Check out the commit locally - subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) - import ipdb; ipdb.set_trace() - repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) + + + repo = LeanGitRepo(new_url, commit) config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) if generate_benchmark_lean4.is_supported_version(v): @@ -239,6 +225,7 @@ def get_compatible_commit(url): def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" + import ipdb; ipdb.set_trace() with open(repo_info_file, "r") as repo_compatibility_file: updated_repos = json.loads(repo_compatibility_file) @@ -317,7 +304,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos # Skip repos that are already known - if repo_full_name not in known_repositories + known_dead_repos + repos: + if repo_full_name not in known_repositories: print("\n\n") logger.info(f"Processing new repo: {repo_full_name}") name = None @@ -326,10 +313,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") - - # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) - lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) cloned_count += 1 From b1d5280f77da3cedc03e4319ce255c83ff3c6070 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 14:08:17 -0700 Subject: [PATCH 06/29] Revert --- git_utils.py | 56 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/git_utils.py b/git_utils.py index 5cd4630..6d56fb4 100644 --- a/git_utils.py +++ b/git_utils.py @@ -16,7 +16,7 @@ import math import os -from constants import known_repositories, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE +from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 @@ -141,10 +141,22 @@ def create_pull_request(repo_full_name, title, body, head_branch): print("Failed to create pull request", response.text) return "" - +def ensure_inside_git(): + """Ensure that the current directory is inside a git repository.""" + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.") + subprocess.run(["git", "init"], check=True) + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" - import ipdb; ipdb.set_trace() try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() @@ -165,18 +177,9 @@ def get_compatible_commit(url): return latest_commit, v logger.info(f"Searching for compatible commit for {url}") - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. Initializing one.") - subprocess.run(["git", "init"], check=True) - + + ensure_inside_git() + ZZ process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, @@ -206,10 +209,21 @@ def get_compatible_commit(url): logger.info(f"Found {len(commits)} commits for {url}") new_url = url.replace(".git", "") + + repo_human_name = "/".join(new_url.split("/")[-2:]) + + # Delete repo if it exists, because it might be checked out to a different commit + if os.path.exists(os.path.join("repos", repo_human_name)): + shutil.rmtree(os.path.join("repos", repo_human_name)) + + subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) + for commit in commits: - - - repo = LeanGitRepo(new_url, commit) + logger.info(f"Checking commit {commit} for {url}") + # Check out the commit locally + subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) + import ipdb; ipdb.set_trace() + repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) if generate_benchmark_lean4.is_supported_version(v): @@ -225,7 +239,6 @@ def get_compatible_commit(url): def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" - import ipdb; ipdb.set_trace() with open(repo_info_file, "r") as repo_compatibility_file: updated_repos = json.loads(repo_compatibility_file) @@ -304,7 +317,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos # Skip repos that are already known - if repo_full_name not in known_repositories: + if repo_full_name not in known_repositories + known_dead_repos + repos: print("\n\n") logger.info(f"Processing new repo: {repo_full_name}") name = None @@ -313,7 +326,10 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") + + # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) + lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) cloned_count += 1 From 8bf2234a2633664769e0a65e33ac30c95a8c4bf2 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 15:33:37 -0700 Subject: [PATCH 07/29] organization --- compute_fisher.py | 1 - constants.py | 7 +- custom_traced_data.py | 6 +- custom_utils.py | 3 +- filenames.py | 12 ++ git_utils.py | 277 ++++++++++++++++++++---------------------- leanagent.py | 38 +++--- leanagent_utils.py | 4 +- 8 files changed, 165 insertions(+), 183 deletions(-) create mode 100644 filenames.py diff --git a/compute_fisher.py b/compute_fisher.py 
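This patch also introduces filenames.py and moves path construction there, so call sites such as compute_fisher.py stop prepending RAID_DIR themselves. A condensed sketch of the pattern with only the Fisher paths shown; the "." fallback is an assumption, the real module reads RAID_DIR from the environment without a default:

```python
import os

# filenames.py-style module: every path is rooted at RAID_DIR exactly once.
RAID_DIR = os.environ.get("RAID_DIR", ".")   # assumption: fall back to the working directory
FISHER_DIR = os.path.join(RAID_DIR, "fisher")


def fisher_path(new_data_path: str) -> str:
    """Mirror of the path compute_fisher.py builds for one dataset's Fisher information."""
    name = os.path.basename(new_data_path.rstrip("/"))
    return os.path.join(FISHER_DIR, f"fisher_info_{name}_distributed.pkl")
```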
index 56b32fe..0ef43fa 100644 --- a/compute_fisher.py +++ b/compute_fisher.py @@ -96,7 +96,6 @@ def main(): # Save the FIM if needed if fisher_trainer.is_global_zero: fisher_file_path = os.path.join( - RAID_DIR, FISHER_DIR, f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl", ) diff --git a/constants.py b/constants.py index 01f0ad4..b87e291 100644 --- a/constants.py +++ b/constants.py @@ -1,5 +1,4 @@ PR_TITLE = "[LeanAgent] Proofs" - PR_BODY = """ [LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. @@ -7,11 +6,13 @@ ~LeanAgent - From the [LeanDojo](https://leandojo.org/) family """ - TMP_BRANCH = "_LeanAgent" - COMMIT_MESSAGE = "[LeanAgent] Proofs" + +MARK_START_SYMBOL = "" +MARK_END_SYMBOL = "" + # List of known repositories to process or skip # Feel free to remove any repos from this list if you would like to test on them diff --git a/custom_traced_data.py b/custom_traced_data.py index ee61c83..18fa8c5 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -15,10 +15,8 @@ from lxml import etree from tqdm import tqdm -from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, - NUM_WORKERS) -from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, - to_lean_path, to_xml_path) +from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, NUM_WORKERS) +from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, to_lean_path, to_xml_path) from .ast import * from .lean import LeanFile, LeanGitRepo, Pos, Theorem diff --git a/custom_utils.py b/custom_utils.py index 181e9ac..33fd56d 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -15,8 +15,7 @@ from loguru import logger -from .constants import (LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, NUM_WORKERS, - TMP_DIR) +from .constants import LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, TMP_DIR @contextmanager diff --git a/filenames.py b/filenames.py new file mode 100644 index 0000000..8f7cfc4 --- /dev/null +++ b/filenames.py @@ -0,0 +1,12 @@ +import os + +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = os.path.join(RAID_DIR, "tmp") +REPO_DIR = os.path.join(RAID_DIR, "repos") +DATA_DIR = os.path.join(RAID_DIR, "data") +CHECKPOINT_DIR = os.path.join(RAID_DIR, "checkpoints") +EVAL_RESULTS_FILE_PATH = os.path.join(RAID_DIR, "eval_results.txt") +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = os.path.join(RAID_DIR, "proof_log.txt") +ENCOUNTERED_THEOREMS_FILE = os.path.join(RAID_DIR, "encountered_theorems.pkl") +FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional \ No newline at end of file diff --git a/git_utils.py b/git_utils.py index 6d56fb4..2c793ea 100644 --- a/git_utils.py +++ b/git_utils.py @@ -20,17 +20,7 @@ personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 -RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" -repo_dir = f"{RAID_DIR}/repos_new" - -DATA_DIR = f"{RAID_DIR}/data" -CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" -DB_FILE_NAME = "db_file.txt" -PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" -ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" -FISHER_DIR = f"{RAID_DIR}/fisher" # Optional +from filenames import REPO_DIR, DATA_DIR def clone_repo(repo_url): @@ -39,7 +29,8 @@ def clone_repo(repo_url): repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") - repo_name = os.path.join(repo_dir, 
repo_name) + repo_name = os.path.join(REPO_DIR, repo_name) + if os.path.exists(repo_name): print(f"Deleting existing repository directory: {repo_name}") shutil.rmtree(repo_name) @@ -54,7 +45,7 @@ def clone_repo(repo_url): def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" proc = subprocess.run( - ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ["git", "-C", repo_name, "branch", "-a"], stdout=subprocess.PIPE, text=True ) branches = proc.stdout.split("\n") local_branch = branch_name @@ -157,130 +148,125 @@ def ensure_inside_git(): def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" - try: - process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] - logger.info(f"Latest commit: {latest_commit}") - - new_url = url.replace(".git", "") - logger.info(f"Creating LeanGitRepo for {new_url}") - - repo = LeanGitRepo(new_url, latest_commit) - logger.info(f"Getting config for {url}") - - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Latest commit compatible for url {url}") - return latest_commit, v - - logger.info(f"Searching for compatible commit for {url}") - - ensure_inside_git() - ZZ - process = subprocess.Popen( - ["git", "fetch", "--depth=1000000", url], # Fetch commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Fetching commits for {url}") - _, stderr = process.communicate() - - if process.returncode != 0: - raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") - - logger.info(f"Fetched commits for {url}") - - process = subprocess.Popen( - ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - logger.info(f"Getting list of commits for {url}") - - stdout, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - - commits = stdout.decode("utf-8").strip().split("\n") - logger.info(f"Found {len(commits)} commits for {url}") - - new_url = url.replace(".git", "") - - repo_human_name = "/".join(new_url.split("/")[-2:]) - - # Delete repo if it exists, because it might be checked out to a different commit - if os.path.exists(os.path.join("repos", repo_human_name)): - shutil.rmtree(os.path.join("repos", repo_human_name)) + if "mathlib4" in url or "SciLean" in url or "pfr" in url: + if "mathlib4" in url: + sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" + v = "v4.8.0" + elif "SciLean" in url: + sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" + v = "v4.7.0" + elif "pfr" in url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + return sha, v + else: + with open(os.path.join("RAID", "data", "repo_info_compatible.json"), "r") as f: + try: + repos_and_compatible_commits = json.load(f) + except json.JSONDecodeError: + repos_and_compatible_commits = [] - subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) - - for commit in commits: - logger.info(f"Checking commit {commit} for {url}") - # Check out the commit locally - subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) - import ipdb; ipdb.set_trace() - repo = 
LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) + if url in [repo["url"] + ".git" for repo in repos_and_compatible_commits if repo["commit"]]: + logger.info(f"Repository {url} already has a compatible commit.") + return None, None + + try: + process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) + stdout, stderr = process.communicate() + latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] + logger.info(f"Latest commit: {latest_commit}") + + new_url = url.replace(".git", "") + logger.info(f"Creating LeanGitRepo for {new_url}") + + repo = LeanGitRepo(new_url, latest_commit) + logger.info(f"Getting config for {url}") + config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) + if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Found compatible commit {commit} for {url}") - return commit, v - - raise Exception("No compatible commit found") - - except Exception as e: - logger.info(f"Error in get_compatible_commit: {str(e)}") - return None, None - - -def find_and_save_compatible_commits(repo_info_file, lean_git_repos): - """Finds compatible commits for various repositories""" - with open(repo_info_file, "r") as repo_compatibility_file: - updated_repos = json.loads(repo_compatibility_file) - - for repo in lean_git_repos: - url = repo.url - if not url.endswith(".git"): - url = url + ".git" + logger.info(f"Latest commit compatible for url {url}") + return latest_commit, v - sha = None - v = None + logger.info(f"Searching for compatible commit for {url}") - # TODO: Check these - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - # Check if it's in any element - for elem in updated_repos: - if url.replace(".git", "") == elem["url"]: - continue - - sha, v = get_compatible_commit(url) + ensure_inside_git() + process = subprocess.Popen( + ["git", "fetch", "--depth=1000000", url], # Fetch commits + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) - - # Always write to json, even for null repos - updated_repos.append( - {"url": url.replace(".git", ""), "commit": sha if sha else None, "version": v if v else None} + logger.info(f"Fetching commits for {url}") + _, stderr = process.communicate() + + if process.returncode != 0: + raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") + + logger.info(f"Fetched commits for {url}") + + process = subprocess.Popen( + ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") + logger.info(f"Getting list of commits for {url}") + + stdout, stderr = process.communicate() + if process.returncode != 0: + raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") + + commits = stdout.decode("utf-8").strip().split("\n") + logger.info(f"Found {len(commits)} commits for {url}") + + new_url = url.replace(".git", "") + + repo_human_name = "/".join(new_url.split("/")[-2:]) + + # Delete repo if it exists, because it might be checked out to a different commit + if os.path.exists(os.path.join("repos", repo_human_name)): + shutil.rmtree(os.path.join("repos", repo_human_name)) + + subprocess.run(["git", "clone", url, os.path.join("repos", 
repo_human_name)], check=True) + for commit in commits: + logger.info(f"Checking commit {commit} for {url}") + # Check out the commit locally + subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], capture_output=False, check=True) + + # Check the lean-toolchain file manually, avoid calling LeanGitRepo because it makes a lot of web requests + with open(os.path.join("repos", repo_human_name, "lean-toolchain"), "r") as f: + config_content = f.read() + + v = generate_benchmark_lean4.get_lean4_version_from_config(config_content) + if generate_benchmark_lean4.is_supported_version(v): + logger.info(f"Found compatible commit {commit} for {url}") + repos_and_compatible_commits.append({"url": url.replace(".git", ""), "commit": commit, "version": v}) + with open(os.path.join(DATA_DIR, "repo_info_compatible.json"), "w") as f: + json.dump(repos_and_compatible_commits, f, indent=2) + f.flush() + + return commit, v + raise Exception("No compatible commit found") + except Exception as e: + logger.info(f"Error in get_compatible_commit: {str(e)}") + return None, None - # Write per repo in case of interrupt - with open(repo_info_file, "w") as f: - json.dump(updated_repos, f) +def find_and_save_compatible_commits(repo_info_file, lean_git_repos): + """Finds and saves compatible commits for various repositories""" + for repo in lean_git_repos: + url = repo.url + if not url.endswith(".git"): + url = url + ".git" + + # Saves the compatible commit in repo_info_file + _sha, _v = get_compatible_commit(url) + + with open(repo_info_file, "r") as repos_and_compatible_commits_f: + updated_repos = json.load(repos_and_compatible_commits_f) + return updated_repos @@ -289,7 +275,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos headers = {"Authorization": personal_access_token} query_params = { "q": f"language:{language}", - "sort": "stars", + "sort": "stars", # What can this be? 
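The rewritten commit search above checks out each candidate commit in a local clone and reads lean-toolchain straight from disk, which avoids the web requests that constructing a LeanGitRepo per commit would trigger. A minimal sketch of that per-commit check; parse_version and is_supported stand in for the helpers in generate_benchmark_lean4:

```python
import subprocess
from pathlib import Path


def toolchain_at(repo_dir: str, commit: str) -> str:
    """Check out commit in an existing local clone and return its lean-toolchain contents."""
    subprocess.run(["git", "-C", repo_dir, "checkout", commit],
                   check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return (Path(repo_dir) / "lean-toolchain").read_text().strip()


def first_compatible(repo_dir, commits, parse_version, is_supported):
    """Walk commits newest-first and return the first (commit, version) with a supported toolchain."""
    for commit in commits:
        version = parse_version(toolchain_at(repo_dir, commit))
        if is_supported(version):
            return commit, version
    raise RuntimeError("No compatible commit found")
```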
"order": "desc", "per_page": 100, } @@ -362,17 +348,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url + ".git" logger.info(f"\n\nProcessing {url}") - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) + sha, v = get_compatible_commit(url) if not sha: logger.info(f"Failed to find a compatible commit for {url}") @@ -382,25 +358,29 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) + if not traced_repo: logger.info(f"Failed to trace {url}") return None - if total_theorems < 3 * BATCH_SIZE: # Should be enough theorems for train/val/test - logger.info(f"No theorems found in {url}") + + if total_theorems < 3 * BATCH_SIZE: # Require enough theorems for train/val/test + logger.info(f"Not enough theorems found in {url}") return None + logger.info(f"Finished generating benchmark at {dst_dir}") # Add the new repo to the dynamic database config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = dst_dir + "/random" - premise_files_corpus = dst_dir + "/corpus.jsonl" - files_traced = dst_dir + "/traced_files.jsonl" + theorems_folder = os.path.join(dst_dir, "theorems") + premise_files_corpus = os.path.join(dst_dir, "corpus.jsonl") + files_traced = os.path.join(dst_dir, "traced_files.jsonl") + pr_url = None data = { "url": repo.url, @@ -420,9 +400,12 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): repo = Repository.from_dict(data) logger.info("Before adding new repo:") db.print_database_contents() - db.add_repository(repo) + + logger.info("After adding new repo:") + db.add_repository(repo) db.print_database_contents() + db.to_json(dynamic_database_json_path) return "Done" @@ -532,14 +515,14 @@ def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: def write_skip_file(repo_url): """Writes a repository URL to a file to skip it.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join(DATA_DIR, "skip_repo.txt") with open(skip_file_path, "w") as f: f.write(repo_url) def should_skip_repo(): """Checks if a repository should be skipped.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join(DATA_DIR, "skip_repo.txt") if os.path.exists(skip_file_path): with open(skip_file_path, "r") as f: repo_url = f.read().strip() diff --git a/leanagent.py b/leanagent.py index 68a3e4d..d29ad0b 100644 --- a/leanagent.py +++ b/leanagent.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple import numpy as np @@ -36,23 +36,14 @@ from retrieval.model import PremiseRetriever from git_utils import find_and_save_compatible_commits, search_github_repositories, should_skip_repo, add_repo_to_database, sort_repositories_by_difficulty, save_sorted_repos +from filenames 
import DATA_DIR, RAID_DIR, CHECKPOINT_DIR, EVAL_RESULTS_FILE_PATH, DB_FILE_NAME, PROOF_LOG_FILE_NAME, ENCOUNTERED_THEOREMS_FILE, FISHER_DIR # Set the seed for reproducibility personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 -RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" -repo_dir = f"{RAID_DIR}/repos_new" - -DATA_DIR = f"{RAID_DIR}/data" -CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" -DB_FILE_NAME = "db_file.txt" -PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" -ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" -FISHER_DIR = f"{RAID_DIR}/fisher" # Optional + repos_for_merged_dataset = [] repos_for_proving = [] @@ -74,6 +65,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: pred = preds_map[key] else: continue + all_pos_premises = set(pred["all_pos_premises"]) if len(all_pos_premises) == 0: continue @@ -111,14 +103,14 @@ def load_fisher_information(file_path): def find_latest_checkpoint(): """Finds the most recent checkpoint.""" - checkpoint_dir = RAID_DIR + "/" + CHECKPOINT_DIR all_checkpoints = [ - os.path.join(checkpoint_dir, f) - for f in os.listdir(checkpoint_dir) + os.path.join(CHECKPOINT_DIR, f) + for f in os.listdir(CHECKPOINT_DIR) if f.endswith(".ckpt") ] - if not all_checkpoints: + if len(all_checkpoints) == 0: raise FileNotFoundError("No checkpoints found.") + latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint @@ -126,14 +118,14 @@ def find_latest_checkpoint(): def find_latest_fisher(): """Finds the most recent Fisher Information Matrix.""" - fisher_dir = RAID_DIR + "/" + FISHER_DIR all_fisher = [ - os.path.join(fisher_dir, f) - for f in os.listdir(fisher_dir) + os.path.join(FISHER_DIR, f) + for f in os.listdir(FISHER_DIR) if f.endswith(".pkl") ] - if not all_fisher: + if len(all_fisher) == 0: raise FileNotFoundError("No Fisher Information Matrices found.") + latest_fisher = max(all_fisher, key=os.path.getmtime) logger.info(f"Using the latest Fisher Information Matrix: {latest_fisher}") return latest_fisher @@ -401,7 +393,7 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p global lean_git_repos global repos # If curriculum learning is enabled, initialize repositories and sort them by difficulty - repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") # Check if the current process is the main one is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 if curriculum_learning: @@ -528,7 +520,7 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) - repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") # All processes wait for the file to be created and then read from it # TODO: Fix with a semaphore or file lock max_attempts = 30 @@ -703,7 +695,7 @@ def main(): if is_main_process: logger.info("Removing skip file") skip_file_path = os.path.join( - RAID_DIR, DATA_DIR, "skip_repo.txt" + DATA_DIR, "skip_repo.txt" ) os.remove(skip_file_path) continue diff --git a/leanagent_utils.py b/leanagent_utils.py index 684b390..381ab8b 100644 --- a/leanagent_utils.py +++ b/leanagent_utils.py @@ -1,6 
+1,4 @@ -MARK_START_SYMBOL = "" -MARK_END_SYMBOL = "" - +from constants import MARK_END_SYMBOL, MARK_START_SYMBOL def remove_marks(s: str) -> str: """Remove all :code:`` and :code:`` from ``s``.""" From 29523db5259111385a86688b5b7d4d16ef47dfc4 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 15 Sep 2025 10:58:52 -0700 Subject: [PATCH 08/29] Trying to update paths so they're created in RAID_DIR/reposM --- constants.py | 6 ++++++ generate_benchmark_lean4.py | 3 +++ git_utils.py | 8 ++++++-- leanagent.py | 8 +++----- run_leanagent.sh | 5 +++-- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/constants.py b/constants.py index b87e291..d11b663 100644 --- a/constants.py +++ b/constants.py @@ -229,10 +229,16 @@ "madvorak/chomsky", "T-Brick/ControlFlow", "pa-ba/guarded-lean", + + + ] known_dead_repos = [ "uwdb/Cosette", "notepad-plus-plus/userDefinedLanguages", "teorth/analysis", + + # Added by Mo to find smaller repo to iterate on + ] \ No newline at end of file diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 67c182c..f1942f1 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -551,13 +551,16 @@ def main(url, commit, dst_dir): logger.info("LeanDojo configured") try: + import ipdb; ipdb.set_trace() logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 + safe_remove_dir(dst_dir) + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( diff --git a/git_utils.py b/git_utils.py index 2c793ea..cf4c459 100644 --- a/git_utils.py +++ b/git_utils.py @@ -8,9 +8,12 @@ from lean_dojo import LeanGitRepo from datetime import datetime import lean_dojo +from lean_dojo.data_extraction.cache import _split_git_url from collections import defaultdict from dynamic_database import Repository, DynamicDatabase, Theorem + + from loguru import logger from typing import Union, List, Tuple import math @@ -26,7 +29,7 @@ def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" # TODO: Fix - repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") + repo_name = os.path.join(*_split_git_url(repo_url)).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") repo_name = os.path.join(REPO_DIR, repo_name) @@ -168,7 +171,8 @@ def get_compatible_commit(url): if url in [repo["url"] + ".git" for repo in repos_and_compatible_commits if repo["commit"]]: logger.info(f"Repository {url} already has a compatible commit.") - return None, None + repo = [repo for repo in repos_and_compatible_commits if repo["url"] + ".git" == url][0] + return repo["commit"], repo["version"] try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) diff --git a/leanagent.py b/leanagent.py index d29ad0b..509e9bf 100644 --- a/leanagent.py +++ b/leanagent.py @@ -574,9 +574,7 @@ def main(): db.generate_merged_dataset(dst_dir, repos_for_merged_dataset) - dst_dir = ( - RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" - ) + dst_dir = os.path.join(DATA_DIR, f"merged_with_new_{dir_name}") new_data_path = dst_dir logger.info("All GPUs") @@ -631,7 +629,7 @@ def main(): dir_name = new_data_path.split("/")[-1] filename_suffix = f"_lambda_{lambda_value}" checkpoint_callback = ModelCheckpoint( - dirpath=RAID_DIR + "/" + CHECKPOINT_DIR, + 
dirpath=CHECKPOINT_DIR, filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", @@ -768,7 +766,7 @@ def main(): logger.info("Testing...") total_R1, total_R10, total_MRR = [], [], [] - dataset_path = RAID_DIR + "/" + DATA_DIR + dataset_path = DATA_DIR testing_paths = [ os.path.join(dataset_path, d) for d in os.listdir(dataset_path) ] diff --git a/run_leanagent.sh b/run_leanagent.sh index e127b50..9cb9aff 100755 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -24,10 +24,11 @@ # # Usage: bash run_leanagent.sh #!/bin/bash -export RAID_DIR="~/Desktop/LeanAgent/RAID/" -export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +export RAID_DIR="/Users/motiwari/Desktop/LeanAgent/RAID" +export LEAN_AGENT_DIR="/Users/motiwari/Desktop/LeanAgent" export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" +export REPO_DIR="${RAID_DIR}/repos" export RAY_TMPDIR="${RAID_DIR}/tmp" export CONDA_SH="/Users/motiwari/miniforge3/etc/profile.d/conda.sh" source ${CONDA_SH} From c2a9f0a04b8d75ac81be7763964880bca5dc4b01 Mon Sep 17 00:00:00 2001 From: motiwari Date: Thu, 18 Sep 2025 11:15:47 -0700 Subject: [PATCH 09/29] Updating changes --- .gitignore | 4 ++++ generate_benchmark_lean4.py | 2 -- git_utils.py | 34 +++++++++++++++++++++++++--------- leanagent.py | 7 +++++++ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index dce68d7..5c87a7b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +workspace +workspace-backup + + *.pkl retrieval/bm25 .idea/ diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index f1942f1..27056b5 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -549,9 +549,7 @@ def main(url, commit, dst_dir): logger.info("Configuring LeanDojo again...") configure_leandojo() logger.info("LeanDojo configured") - try: - import ipdb; ipdb.set_trace() logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") diff --git a/git_utils.py b/git_utils.py index cf4c459..2be50c3 100644 --- a/git_utils.py +++ b/git_utils.py @@ -28,23 +28,30 @@ def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" - # TODO: Fix repo_name = os.path.join(*_split_git_url(repo_url)).replace(".git", "") - logger.info(f"Cloning {repo_url}") + logger.info(f"Repo name: {repo_name}") - repo_name = os.path.join(REPO_DIR, repo_name) + repo_name = os.path.join(REPO_DIR, repo_name) if os.path.exists(repo_name): - print(f"Deleting existing repository directory: {repo_name}") - shutil.rmtree(repo_name) + print(f"Repository already exists in directory: {repo_name}") + process = subprocess.Popen( + ["git", "-C", repo_name, "rev-parse", "HEAD"], stdout=subprocess.PIPE + ) + stdout, _stderr = process.communicate() + else: + logger.info(f"Cloning {repo_url} from scratch") + subprocess.run(["git", "clone", repo_url, repo_name]) + process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) + stdout, _stderr = process.communicate() - subprocess.run(["git", "clone", repo_url, repo_name]) - process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) - stdout, _stderr = process.communicate() sha = re.split(r"\t+", stdout.decode("utf-8"))[0] + sha = sha.strip() + print("Sha is " + sha) return repo_name, sha + def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" proc = subprocess.run( @@ -231,6 +238,7 @@ def get_compatible_commit(url): #
Delete repo if it exists, because it might be checked out to a different commit if os.path.exists(os.path.join("repos", repo_human_name)): + logger.info(f"CAREFUL: Deleting existing repo at {os.path.join('repos', repo_human_name)}") shutil.rmtree(os.path.join("repos", repo_human_name)) subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) @@ -316,7 +324,6 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") - # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) @@ -325,6 +332,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos cloned_count += 1 logger.info(f"Cloned {repo_full_name}") except Exception as e: + logger.info(f"CAREFUL: Deleting existing repo at {os.path.join('repos', repo_full_name)}") shutil.rmtree(name) logger.info(f"Failed to clone {repo_full_name} because of {e}") else: @@ -359,11 +367,19 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): return None logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + + # Ensure that the repo is checked out to the compatible commit + repo_name, _ = clone_repo(url) + subprocess.run(["git", "-C", repo_name, "checkout", sha], check=True) + logger.info(f"Checked out {url} to commit {sha}") + + url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") + traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) diff --git a/leanagent.py b/leanagent.py index 509e9bf..de84f97 100644 --- a/leanagent.py +++ b/leanagent.py @@ -399,6 +399,10 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p if curriculum_learning: logger.info("Starting curriculum learning") if is_main_process: + if num_repos < 3: + logger.warning("num_repos should be at least 3 for curriculum learning") + + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) for i in range(len(lean_git_repos)): @@ -413,6 +417,9 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p f"Successfully added {num_repos} repositories to the database" ) + if len(db.repositories) < 3: + raise ValueError("The database should contain at least 3 repositories for curriculum learning") + sorted_repos, categorized_theorems, percentiles = ( sort_repositories_by_difficulty(db) ) From 97dc30308f5aa7272ed61cc803e61b6133badbbc Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 23 Sep 2025 09:37:12 -0400 Subject: [PATCH 10/29] fix: run on CPU-only dev envs; handle empty difficulty list --- .gitignore | 7 +++++++ common.py | 11 +++++++++-- git_utils.py | 4 ++++ prover/proof_search.py | 7 ++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 5c87a7b..4f25a37 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,10 @@ dmypy.json # Pyre type checker .pyre/ /.lake + +# local artifacts +RAID/ +lean_dojo/ +*.log +.lake/ +**/.wt-lean48* diff --git a/common.py b/common.py index b9eafef..1509193 100644 --- a/common.py +++ b/common.py @@ -10,10 +10,17 @@ import networkx as nx import pytorch_lightning as pl import torch -from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +try: + from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +except 
Exception: + DeepSpeedCPUAdam = None + FusedAdam = None from lean_dojo import Pos from loguru import logger -from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +try: + from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +except Exception: + class DeepSpeedStrategy: ... # placeholder so isinstance checks won't explode from pytorch_lightning.utilities.deepspeed import \ convert_zero_checkpoint_to_fp32_state_dict from transformers import get_cosine_schedule_with_warmup diff --git a/git_utils.py b/git_utils.py index 2be50c3..28a2d32 100644 --- a/git_utils.py +++ b/git_utils.py @@ -480,6 +480,10 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: db.update_repository(repo) print(f"Finished {repo.name}") + if len(all_difficulties) == 0: + from loguru import logger + logger.warning("No theorem difficulties found; skipping difficulty bucketing.") + return [] percentiles = np.percentile(all_difficulties, [33, 67]) categorized_theorems = defaultdict(lambda: defaultdict(list)) diff --git a/prover/proof_search.py b/prover/proof_search.py index dd08fe6..0c80122 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -17,7 +17,12 @@ TimeoutError) from loguru import logger from ray.util.actor_pool import ActorPool -from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams +try: + from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams + VLLM_AVAILABLE = True +except Exception: + AsyncEngineArgs = AsyncLLMEngine = RequestOutput = SamplingParams = None + VLLM_AVAILABLE = False from common import zip_strict from generator.model import FixedTacticGenerator, RetrievalAugmentedGenerator From 6066b3acf19e3958b223a156ce3bf75dbf720ee1 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 23 Oct 2025 23:26:07 -0400 Subject: [PATCH 11/29] Guard repo_info reads with a file lock --- leanagent.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/leanagent.py b/leanagent.py index de84f97..c87e84e 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,3 +1,4 @@ +import fcntl import json import os import pickle @@ -506,7 +507,7 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 1 + num_repos = 4 dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -528,14 +529,18 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") - # All processes wait for the file to be created and then read from it - # TODO: Fix with a semaphore or file lock + lock_path = f"{repo_info_file}.lock" max_attempts = 30 for attempt in range(max_attempts): try: - with open(repo_info_file, "r") as f: - repo_info = json.load(f) - break + with open(lock_path, "a") as lock_handle: + fcntl.flock(lock_handle.fileno(), fcntl.LOCK_EX) + try: + with open(repo_info_file, "r") as f: + repo_info = json.load(f) + break + finally: + fcntl.flock(lock_handle.fileno(), fcntl.LOCK_UN) except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: raise Exception( From 8a94e5b036b8a8009b327f2c44abe14641dd89a6 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 19:12:18 -0400 Subject: [PATCH 12/29] Guard JSON access with file lock and adjust dataset handling --- .mplcache/fontlist-v390.json | 4664 ++++++++++++++++++++++++++++++++++ filenames.py | 6 +- git_utils.py | 6 +- leanagent.py | 38 
+- requirements-local.txt | 20 + requirements.cpu.txt | 17 + retrieval/bm25/main.py | 37 +- scripts/manual_trace.py | 76 + testfile_root | 2 + 9 files changed, 4828 insertions(+), 38 deletions(-) create mode 100644 .mplcache/fontlist-v390.json create mode 100644 requirements-local.txt create mode 100644 requirements.cpu.txt create mode 100644 scripts/manual_trace.py create mode 100644 testfile_root diff --git a/.mplcache/fontlist-v390.json b/.mplcache/fontlist-v390.json new file mode 100644 index 0000000..5910b87 --- /dev/null +++ b/.mplcache/fontlist-v390.json @@ -0,0 +1,4664 @@ +{ + "_version": 390, + "_FontManager__default_weight": "normal", + "default_size": null, + "defaultFamily": { + "ttf": "DejaVu Sans", + "afm": "Helvetica" + }, + "afmlist": [ + { + "fname": "fonts/afm/pbkli8a.afm", + "name": "ITC Bookman", + "style": "italic", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-Bold.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmex10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvb8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkdi8a.afm", + "name": "ITC Bookman", + "style": "italic", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmri8a.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmtt10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvlo8a.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pzcmi8a.afm", + "name": "ITC Zapf Chancery", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putri8a.afm", + "name": "Utopia", + "style": "italic", + "variant": "normal", + "weight": "regular", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Symbol.afm", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmbi8a.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagko8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "italic", + "variant": "normal", + "weight": "book", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvro8an.afm", + "name": "Helvetica", + "style": "italic", 
+ "variant": "normal", + "weight": "medium", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvr8an.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pzdr.afm", + "name": "ITC Zapf Dingbats", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplri8a.afm", + "name": "Palatino", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmmi10.afm", + "name": "Computer Modern", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Italic.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-Oblique.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-BoldOblique.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmr8a.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Bold.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagdo8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "italic", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvro8a.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-BoldItalic.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvl8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvbo8an.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putbi8a.afm", + "name": "Utopia", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrb8a.afm", + 
"name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplbi8a.afm", + "name": "Palatino", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrbo8a.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/ZapfDingbats.afm", + "name": "ZapfDingbats", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrr8a.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmsy10.afm", + "name": "Computer Modern", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmb8a.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvb8an.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncri8a.afm", + "name": "New Century Schoolbook", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putb8a.afm", + "name": "Utopia", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncb8a.afm", + "name": "New Century Schoolbook", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagk8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "normal", + "variant": "normal", + "weight": "book", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplb8a.afm", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-BoldOblique.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkl8a.afm", + "name": "ITC Bookman", + "style": "normal", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvr8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvbo8a.afm", + 
"name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-Oblique.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putr8a.afm", + "name": "Utopia", + "style": "normal", + "variant": "normal", + "weight": "regular", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/psyr.afm", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplr8a.afm", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Roman.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagd8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "normal", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkd8a.afm", + "name": "ITC Bookman", + "style": "normal", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrro8a.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-Bold.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncr8a.afm", + "name": "New Century Schoolbook", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncbi8a.afm", + "name": "New Century Schoolbook", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmr10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + } + ], + "ttflist": [ + { + "fname": "fonts/ttf/DejaVuSansMono-Oblique.ttf", + "name": "DejaVu Sans Mono", + "style": "oblique", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizOneSymBol.ttf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans.ttf", + "name": "DejaVu Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneral.ttf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": 
"FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono-Bold.ttf", + "name": "DejaVu Sans Mono", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralBolIta.ttf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif.ttf", + "name": "DejaVu Serif", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFourSymBol.ttf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmsy10.ttf", + "name": "cmsy10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmex10.ttf", + "name": "cmex10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizThreeSymBol.ttf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmss10.ttf", + "name": "cmss10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmmi10.ttf", + "name": "cmmi10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-Italic.ttf", + "name": "DejaVu Serif", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmtt10.ttf", + "name": "cmtt10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralItalic.ttf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniIta.ttf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-BoldOblique.ttf", + "name": "DejaVu Sans", + "style": "oblique", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono.ttf", + "name": "DejaVu Sans Mono", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerifDisplay.ttf", + "name": "DejaVu Serif Display", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-Bold.ttf", + "name": "DejaVu Serif", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": 
"scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansDisplay.ttf", + "name": "DejaVu Sans Display", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizOneSymReg.ttf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniBolIta.ttf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralBol.ttf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFiveSymReg.ttf", + "name": "STIXSizeFiveSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmr10.ttf", + "name": "cmr10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmb10.ttf", + "name": "cmb10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono-BoldOblique.ttf", + "name": "DejaVu Sans Mono", + "style": "oblique", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizTwoSymReg.ttf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-BoldItalic.ttf", + "name": "DejaVu Serif", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUni.ttf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-Oblique.ttf", + "name": "DejaVu Sans", + "style": "oblique", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-Bold.ttf", + "name": "DejaVu Sans", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniBol.ttf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizThreeSymReg.ttf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizTwoSymBol.ttf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFourSymReg.ttf", + "name": "STIXSizeFourSym", + "style": "normal", 
+ "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Herculanum.ttf", + "name": "Herculanum", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBuhid-Regular.ttf", + "name": "Noto Sans Buhid", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/InaiMathi-MN.ttc", + "name": "InaiMathi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana.ttf", + "name": "Verdana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SnellRoundhand.ttc", + "name": "Snell Roundhand", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiViet-Regular.ttf", + "name": "Noto Sans Tai Viet", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMeeteiMayek-Regular.ttf", + "name": "Noto Sans Meetei Mayek", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCaucasianAlbanian-Regular.ttf", + "name": "Noto Sans Caucasian Albanian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHanunoo-Regular.ttf", + "name": "Noto Sans Hanunoo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMahajani-Regular.ttf", + "name": "Noto Sans Mahajani", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoText-Italic.ttf", + "name": "STIX Two Text", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AlBayan.ttc", + "name": "Al Bayan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Luminari.ttf", + "name": "Luminari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/NotoSansEgyptianHieroglyphs-Regular.ttf", + "name": "Noto Sans Egyptian Hieroglyphs", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Ayuthaya.ttf", + "name": "Ayuthaya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PartyLET-plain.ttf", + "name": "Party LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Cochin.ttc", + "name": "Cochin", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W8.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 800, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi.ttf", + "name": "Gurmukhi MT", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansUgaritic-Regular.ttf", + "name": "Noto Sans Ugaritic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/ITFDevanagari.ttc", + "name": "ITF Devanagari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Bold Italic.ttf", + "name": "Georgia", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Khmer Sangam MN.ttf", + "name": "Khmer Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ThonburiUI.ttc", + "name": ".ThonburiUI", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntDReg.otf", + "name": "STIXIntegralsD", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifNyiakengPuachueHmong-Regular.ttf", + "name": "Noto Serif Hmong Nyiakeng", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBrahmi-Regular.ttf", + "name": "Noto Sans Brahmi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Hoefler Text Ornaments.ttf", + "name": "Hoefler Text", + "style": "normal", + "variant": "normal", + "weight": 400, 
+ "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBhaiksuki-Regular.ttf", + "name": "Noto Sans Bhaiksuki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Italic.ttf", + "name": "Georgia", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sathu.ttf", + "name": "Sathu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kailasa.ttc", + "name": "Kailasa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSMonoItalic.ttf", + "name": ".SF NS Mono", + "style": "italic", + "variant": "normal", + "weight": 295, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Unicode.ttf", + "name": "Arial Unicode MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSharada-Regular.ttf", + "name": "Noto Sans Sharada", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mishafi.ttf", + "name": "Mishafi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Diwan Thuluth.ttf", + "name": "Diwan Thuluth", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXVarBol.otf", + "name": "STIXVariants", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Futura.ttc", + "name": "Futura", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansThaana-Regular.ttf", + "name": "Noto Sans Thaana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi Sangam MN.ttc", + "name": "Gurmukhi Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArmenianRounded.ttf", + "name": ".SF Armenian Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Diwan Kufi.ttc", + "name": "Diwan Kufi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": 
"scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Webdings.ttf", + "name": "Webdings", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Damascus.ttc", + "name": "Damascus", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorGujarati.ttc", + "name": "Kohinoor Gujarati", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiTham-Regular.ttf", + "name": "Noto Sans Tai Tham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Bold Italic.ttf", + "name": "Arial Narrow", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansElbasan-Regular.ttf", + "name": "Noto Sans Elbasan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFiveSymReg.otf", + "name": "STIXSizeFiveSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSans.ttc", + "name": "PT Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCypriot-Regular.ttf", + "name": "Noto Sans Cypriot", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W6.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 600, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Comic Sans MS.ttf", + "name": "Comic Sans MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldSouthArabian-Regular.ttf", + "name": "Noto Sans Old South Arabian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMiao-Regular.ttf", + "name": "Noto Sans Miao", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKharoshthi-Regular.ttf", + "name": "Noto Sans Kharoshthi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DIN Alternate Bold.ttf", + 
"name": "DIN Alternate", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniIta.otf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Italic.ttf", + "name": "Trebuchet MS", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOsmanya-Regular.ttf", + "name": "Noto Sans Osmanya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kannada Sangam MN.ttc", + "name": "Kannada Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Italic.ttf", + "name": "Verdana", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W1.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 200, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFHebrewRounded.ttf", + "name": ".SF Hebrew Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/CJKSymbolsFallback.ttc", + "name": ".CJK Symbols Fallback HK", + "style": "normal", + "variant": "normal", + "weight": 542, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizThreeSymBol.otf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/GillSans.ttc", + "name": "Gill Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGunjalaGondi-Regular.otf", + "name": "Noto Sans Gunjala Gondi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMultani-Regular.ttf", + "name": "Noto Sans Multani", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTagbanwa-Regular.ttf", + "name": "Noto Sans Tagbanwa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Black.ttf", + "name": "Arial Black", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + 
"__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBatak-Regular.ttf", + "name": "Noto Sans Batak", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Optima.ttc", + "name": "Optima", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompactRounded.ttf", + "name": ".SF Compact Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSaurashtra-Regular.ttf", + "name": "Noto Sans Saurashtra", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Chalkboard.ttc", + "name": "Chalkboard", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Marion.ttc", + "name": "Marion", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/HelveticaNeue.ttc", + "name": "Helvetica Neue", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings 2.ttf", + "name": "Wingdings 2", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansChakma-Regular.ttf", + "name": "Noto Sans Chakma", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sinhala MN.ttc", + "name": "Sinhala MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneral.otf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSoraSompeng-Regular.ttf", + "name": "Noto Sans Sora Sompeng", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSiddham-Regular.otf", + "name": "Noto Sans Siddham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansImperialAramaic-Regular.ttf", + "name": "Noto Sans Imperial Aramaic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/Library/Fonts/Arial Unicode.ttf", + "name": "Arial Unicode MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", 
+ "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSyriac-Regular.ttf", + "name": "Noto Sans Syriac", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKaithi-Regular.ttf", + "name": "Noto Sans Kaithi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Bold.ttf", + "name": "Courier New", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralItalic.otf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Lao Sangam MN.ttf", + "name": "Lao Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Krungthep.ttf", + "name": "Krungthep", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifYezidi-Regular.otf", + "name": "Noto Serif Yezidi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansOriya.ttc", + "name": "Noto Sans Oriya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Symbols.ttf", + "name": "Apple Symbols", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Impact.ttf", + "name": "Impact", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Noteworthy.ttc", + "name": "Noteworthy", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ArialHB.ttc", + "name": "Arial Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/GujaratiMT.ttc", + "name": "Gujarati MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSylotiNagri-Regular.ttf", + "name": "Noto Sans Syloti Nagri", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMeroitic-Regular.ttf", + "name": "Noto Sans Meroitic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": 
"FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Corsiva.ttc", + "name": "Corsiva Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCanadianAboriginal-Regular.otf", + "name": "Noto Sans Canadian Aboriginal", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Galvji.ttc", + "name": "Galvji", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTirhuta-Regular.ttf", + "name": "Noto Sans Tirhuta", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntSmBol.otf", + "name": "STIXIntegralsSm", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W4.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralBolIta.otf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntDBol.otf", + "name": "STIXIntegralsD", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMandaic-Regular.ttf", + "name": "Noto Sans Mandaic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Rounded Bold.ttf", + "name": "Arial Rounded MT Bold", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Bold Italic.ttf", + "name": "Verdana", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTMono.ttc", + "name": "PT Mono", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Bold Italic.ttf", + "name": "Courier New", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Iowan Old Style.ttc", + "name": "Iowan Old Style", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Nadeem.ttc", + "name": "Nadeem", + "style": "normal", + "variant": "normal", + 
"weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72.ttc", + "name": "Bodoni 72", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSerif.ttc", + "name": "PT Serif", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mshtakan.ttc", + "name": "Mshtakan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansJavanese-Regular.otf", + "name": "Noto Sans Javanese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Andale Mono.ttf", + "name": "Andale Mono", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Phosphate.ttc", + "name": "Phosphate", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/BigCaslon.ttf", + "name": "Big Caslon", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHanifiRohingya-Regular.ttf", + "name": "Noto Sans Hanifi Rohingya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCarian-Regular.ttf", + "name": "Noto Sans Carian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Academy Engraved LET Fonts.ttf", + "name": "Academy Engraved LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New.ttf", + "name": "Courier New", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Times.ttc", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldPersian-Regular.ttf", + "name": "Noto Sans Old Persian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Seravek.ttc", + "name": "Seravek", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Microsoft Sans Serif.ttf", + "name": "Microsoft Sans Serif", + "style": "normal", + "variant": 
"normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Keyboard.ttf", + "name": ".Keyboard", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansKannada.ttc", + "name": "Noto Sans Kannada", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Courier.ttc", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Bold Italic.ttf", + "name": "Arial", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizTwoSymBol.otf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPhagsPa-Regular.ttf", + "name": "Noto Sans PhagsPa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Helvetica.ttc", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Athelas.ttc", + "name": "Athelas", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFourSymBol.otf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansInscriptionalPahlavi-Regular.ttf", + "name": "Noto Sans Inscriptional Pahlavi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Pinpoint 6 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLisu-Regular.ttf", + "name": "Noto Sans Lisu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSRounded.ttf", + "name": ".SF NS Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKayahLi-Regular.ttf", + "name": "Noto Sans Kayah Li", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/AquaKana.ttc", + "name": ".Aqua Kana", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", 
+ "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS.ttf", + "name": "Trebuchet MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNKo-Regular.ttf", + "name": "Noto Sans NKo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNewTaiLue-Regular.ttf", + "name": "Noto Sans New Tai Lue", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Outline 8 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trattatello.ttf", + "name": "Trattatello", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sinhala Sangam MN.ttc", + "name": "Sinhala Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W2.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 250, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NewYorkItalic.ttf", + "name": ".New York", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SukhumvitSet.ttc", + "name": "Sukhumvit Set", + "style": "normal", + "variant": "normal", + "weight": 250, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi MN.ttc", + "name": "Gurmukhi MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Hiragino Sans GB.ttc", + "name": "Hiragino Sans GB", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldPermic-Regular.ttf", + "name": "Noto Sans Old Permic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSItalic.ttf", + "name": "System Font", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGlagolitic-Regular.ttf", + "name": "Noto Sans Glagolitic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGothic-Regular.ttf", + "name": "Noto Sans Gothic", + "style": "normal", + 
"variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Farisi.ttf", + "name": "Farisi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Silom.ttf", + "name": "Silom", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings.ttf", + "name": "Wingdings", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpReg.otf", + "name": "STIXIntegralsUp", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansAdlam-Regular.ttf", + "name": "Noto Sans Adlam", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCuneiform-Regular.ttf", + "name": "Noto Sans Cuneiform", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bradley Hand Bold.ttf", + "name": "Bradley Hand", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOlChiki-Regular.ttf", + "name": "Noto Sans Ol Chiki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorBangla.ttc", + "name": "Kohinoor Bangla", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLycian-Regular.ttf", + "name": "Noto Sans Lycian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/MarkerFelt.ttc", + "name": "Marker Felt", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMarchen-Regular.ttf", + "name": "Noto Sans Marchen", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Bold.ttf", + "name": "Arial", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u4e38\u30b3\u3099 ProN W4.ttc", + "name": "Hiragino Maru Gothic Pro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPhoenician-Regular.ttf", + "name": 
"Noto Sans Phoenician", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir Next.ttc", + "name": "Avenir Next", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Apple Chancery.ttf", + "name": "Apple Chancery", + "style": "normal", + "variant": "normal", + "weight": 0, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoMath.otf", + "name": "STIX Two Math", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Comic Sans MS Bold.ttf", + "name": "Comic Sans MS", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizTwoSymReg.otf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Chalkduster.ttf", + "name": "Chalkduster", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sana.ttc", + "name": "Sana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSMono.ttf", + "name": ".SF NS Mono", + "style": "normal", + "variant": "normal", + "weight": 295, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoText.ttf", + "name": "STIX Two Text", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PlantagenetCherokee.ttf", + "name": "Plantagenet Cherokee", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldItalic-Regular.ttf", + "name": "Noto Sans Old Italic", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/KufiStandardGK.ttc", + "name": "KufiStandardGK", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Malayalam Sangam MN.ttc", + "name": "Malayalam Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Bold.ttf", + "name": "Times New Roman", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoNastaliq.ttc", + "name": "Noto Nastaliq 
Urdu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFourSymReg.otf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHatran-Regular.ttf", + "name": "Noto Sans Hatran", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOsage-Regular.ttf", + "name": "Noto Sans Osage", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansVai-Regular.ttf", + "name": "Noto Sans Vai", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorTelugu.ttc", + "name": "Kohinoor Telugu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DecoTypeNaskh.ttc", + "name": "DecoType Naskh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNS.ttf", + "name": "System Font", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCamera.ttf", + "name": ".SF Camera", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bangla MN.ttc", + "name": "Bangla MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldHungarian-Regular.ttf", + "name": "Noto Sans Old Hungarian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Menlo.ttc", + "name": "Menlo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Outline 6 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLydian-Regular.ttf", + "name": "Noto Sans Lydian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W7.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompactItalic.ttf", + "name": ".SF 
Compact", + "style": "italic", + "variant": "normal", + "weight": 1000, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTagalog-Regular.ttf", + "name": "Noto Sans Tagalog", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Hoefler Text.ttc", + "name": "Hoefler Text", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SuperClarendon.ttc", + "name": "Superclarendon", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansArmenian.ttc", + "name": "Noto Sans Armenian", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72 Smallcaps Book.ttf", + "name": "Bodoni 72 Smallcaps", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings 3.ttf", + "name": "Wingdings 3", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizThreeSymReg.otf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansWancho-Regular.ttf", + "name": "Noto Sans Wancho", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Symbol.ttf", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NewPeninimMT.ttc", + "name": "New Peninim MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/DecoTypeNastaleeqUrdu.ttc", + "name": ".DecoType Nastaleeq Urdu UI", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Muna.ttc", + "name": "Muna", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Rockwell.ttc", + "name": "Rockwell", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Charter.ttc", + "name": "Charter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AppleMyungjo.ttf", + "name": "AppleMyungjo", + "style": "normal", 
+ "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Devanagari Sangam MN.ttc", + "name": "Devanagari Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBuginese-Regular.ttf", + "name": "Noto Sans Buginese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Bold.ttf", + "name": "Verdana", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansMyanmar.ttc", + "name": "Noto Sans Myanmar", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Malayalam MN.ttc", + "name": "Malayalam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpSmBol.otf", + "name": "STIXIntegralsUpSm", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/STHeiti Light.ttc", + "name": "Heiti TC", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMro-Regular.ttf", + "name": "Noto Sans Mro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72 OS.ttc", + "name": "Bodoni 72 Oldstyle", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Skia.ttf", + "name": "Skia", + "style": "normal", + "variant": "normal", + "weight": 5, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u660e\u671d ProN.ttc", + "name": "Hiragino Mincho ProN", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ADTNumeric.ttc", + "name": ".SF Numeric", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial.ttf", + "name": "Arial", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Thonburi.ttc", + "name": "Thonburi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/MuktaMahee.ttc", + "name": "Mukta Mahee", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": 
"normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSerifCaption.ttc", + "name": "PT Serif Caption", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansModi-Regular.ttf", + "name": "Noto Sans Modi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DevanagariMT.ttc", + "name": "Devanagari MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniBolIta.otf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLinearA-Regular.ttf", + "name": "Noto Sans Linear A", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMendeKikakui-Regular.ttf", + "name": "Noto Sans Mende Kikakui", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Khmer MN.ttc", + "name": "Khmer MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W0.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Oriya Sangam MN.ttc", + "name": "Oriya Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Oriya MN.ttc", + "name": "Oriya MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansInscriptionalParthian-Regular.ttf", + "name": "Noto Sans Inscriptional Parthian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSamaritan-Regular.ttf", + "name": "Noto Sans Samaritan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Bold.ttf", + "name": "Arial Narrow", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Bold.ttf", + "name": "Trebuchet MS", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/NotoSerifMyanmar.ttc", + "name": "Noto Serif Myanmar", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W3.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Bold Italic.ttf", + "name": "Trebuchet MS", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Waseem.ttc", + "name": "Waseem", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArmenian.ttf", + "name": ".SF Armenian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NewYork.ttf", + "name": ".New York", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBamum-Regular.ttf", + "name": "Noto Sans Bamum", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiLe-Regular.ttf", + "name": "Noto Sans Tai Le", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Telugu Sangam MN.ttc", + "name": "Telugu Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniBol.otf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir Next Condensed.ttc", + "name": "Avenir Next Condensed", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Al Nile.ttc", + "name": "Al Nile", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntSmReg.otf", + "name": "STIXIntegralsSm", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Zapfino.ttf", + "name": "Zapfino", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Italic.ttf", + "name": "Arial Narrow", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/NotoSansTifinagh-Regular.otf", + "name": "Noto Sans Tifinagh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/STHeiti Medium.ttc", + "name": "Heiti TC", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLepcha-Regular.ttf", + "name": "Noto Sans Lepcha", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansDuployan-Regular.ttf", + "name": "Noto Sans Duployan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ZapfDingbats.ttf", + "name": "Zapf Dingbats", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizOneSymReg.otf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Beirut.ttc", + "name": "Beirut", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Palatino.ttc", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralBol.otf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia.ttf", + "name": "Georgia", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SignPainter.ttc", + "name": "SignPainter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Copperplate.ttc", + "name": "Copperplate", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFHebrew.ttf", + "name": ".SF Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCham-Regular.ttf", + "name": "Noto Sans Cham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Geneva.ttf", + "name": "Geneva", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansManichaean-Regular.ttf", + "name": "Noto Sans 
Manichaean", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Raanana.ttc", + "name": "Raanana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpDBol.otf", + "name": "STIXIntegralsUpD", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Italic.ttf", + "name": "Arial", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpSmReg.otf", + "name": "STIXIntegralsUpSm", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArabic.ttf", + "name": ".SF Arabic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Pinpoint 8 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMongolian-Regular.ttf", + "name": "Noto Sans Mongolian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Savoye LET.ttc", + "name": "Savoye LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bangla Sangam MN.ttc", + "name": "Bangla Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AppleGothic.ttf", + "name": "AppleGothic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNewa-Regular.ttf", + "name": "Noto Sans Newa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLinearB-Regular.ttf", + "name": "Noto Sans Linear B", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Papyrus.ttc", + "name": "Papyrus", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansAvestan-Regular.ttf", + "name": "Noto Sans Avestan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Italic.ttf", + "name": 
"Courier New", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Songti.ttc", + "name": "Songti SC", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifAhom-Regular.ttf", + "name": "Noto Serif Ahom", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Lao MN.ttc", + "name": "Lao MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPahawhHmong-Regular.ttf", + "name": "Noto Sans Pahawh Hmong", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/LucidaGrande.ttc", + "name": "Lucida Grande", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBassaVah-Regular.ttf", + "name": "Noto Sans Bassa Vah", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Didot.ttc", + "name": "Didot", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Baghdad.ttc", + "name": "Baghdad", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPsalterPahlavi-Regular.ttf", + "name": "Noto Sans Psalter Pahlavi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Bold.ttf", + "name": "Georgia", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Kohinoor.ttc", + "name": "Kohinoor Devanagari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifBalinese-Regular.ttf", + "name": "Noto Serif Balinese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/KefaIII.ttf", + "name": "Kefa III", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Baskerville.ttc", + "name": "Baskerville", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldTurkic-Regular.ttf", + "name": "Noto 
Sans Old Turkic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kannada MN.ttc", + "name": "Kannada MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFGeorgianRounded.ttf", + "name": ".SF Georgian Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Monaco.ttf", + "name": "Monaco", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFGeorgian.ttf", + "name": ".SF Georgian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AmericanTypewriter.ttc", + "name": "American Typewriter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Al Tarikh.ttc", + "name": "Al Tarikh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tamil MN.ttc", + "name": "Tamil MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompact.ttf", + "name": ".SF Compact", + "style": "normal", + "variant": "normal", + "weight": 1000, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansRejang-Regular.ttf", + "name": "Noto Sans Rejang", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLimbu-Regular.ttf", + "name": "Noto Sans Limbu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Shree714.ttc", + "name": "Shree Devanagari 714", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMasaramGondi-Regular.otf", + "name": "Noto Sans Masaram Gondi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman.ttf", + "name": "Times New Roman", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Bold Italic.ttf", + "name": "Times New Roman", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gujarati Sangam MN.ttc", + "name": "Gujarati Sangam 
MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mishafi Gold.ttf", + "name": "Mishafi Gold", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DIN Condensed Bold.ttf", + "name": "DIN Condensed", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir.ttc", + "name": "Avenir", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansWarangCiti-Regular.ttf", + "name": "Noto Sans Warang Citi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W9.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Farah.ttc", + "name": "Farah", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUni.otf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Italic.ttf", + "name": "Times New Roman", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpBol.otf", + "name": "STIXIntegralsUp", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/ChalkboardSE.ttc", + "name": "Chalkboard SE", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldNorthArabian-Regular.ttf", + "name": "Noto Sans Old North Arabian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXVar.otf", + "name": "STIXVariants", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTakri-Regular.ttf", + "name": "Noto Sans Takri", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tahoma Bold.ttf", + "name": "Tahoma", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/Telugu MN.ttc", + "name": "Telugu MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tahoma.ttf", + "name": "Tahoma", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNagMundari-Regular.ttf", + "name": "Noto Sans Nag Mundari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/EuphemiaCAS.ttc", + "name": "Euphemia UCAS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansYi-Regular.ttf", + "name": "Noto Sans Yi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArabicRounded.ttf", + "name": ".SF Arabic Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpDReg.otf", + "name": "STIXIntegralsUpD", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizOneSymBol.otf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tamil Sangam MN.ttc", + "name": "Tamil Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKhudawadi-Regular.ttf", + "name": "Noto Sans Khudawadi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPalmyrene-Regular.ttf", + "name": "Noto Sans Palmyrene", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Myanmar MN.ttc", + "name": "Myanmar MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/AppleSDGothicNeo.ttc", + "name": "Apple SD Gothic Neo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Myanmar Sangam MN.ttc", + "name": "Myanmar Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKhojki-Regular.ttf", + "name": "Noto Sans Khojki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + 
"__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPauCinHau-Regular.ttf", + "name": "Noto Sans Pau Cin Hau", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSundanese-Regular.ttf", + "name": "Noto Sans Sundanese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Brush Script.ttf", + "name": "Brush Script MT", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/GeezaPro.ttc", + "name": "Geeza Pro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni Ornaments.ttf", + "name": "Bodoni Ornaments", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W5.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kokonor.ttf", + "name": "Kokonor", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNabataean-Regular.ttf", + "name": "Noto Sans Nabataean", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCoptic-Regular.ttf", + "name": "Noto Sans Coptic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow.ttf", + "name": "Arial Narrow", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + } + ], + "__class__": "FontManager" +} \ No newline at end of file diff --git a/filenames.py b/filenames.py index 8f7cfc4..ea7828c 100644 --- a/filenames.py +++ b/filenames.py @@ -1,7 +1,9 @@ import os RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = os.path.join(RAID_DIR, "tmp") +ray_tmp = "/tmp/ray" +os.makedirs(ray_tmp, exist_ok=True) +os.environ["RAY_TMPDIR"] = ray_tmp REPO_DIR = os.path.join(RAID_DIR, "repos") DATA_DIR = os.path.join(RAID_DIR, "data") CHECKPOINT_DIR = os.path.join(RAID_DIR, "checkpoints") @@ -9,4 +11,4 @@ DB_FILE_NAME = "db_file.txt" PROOF_LOG_FILE_NAME = os.path.join(RAID_DIR, "proof_log.txt") ENCOUNTERED_THEOREMS_FILE = os.path.join(RAID_DIR, "encountered_theorems.pkl") -FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional \ No newline at end of file +FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional diff --git a/git_utils.py b/git_utils.py index 28a2d32..4883df1 100644 --- a/git_utils.py +++ b/git_utils.py @@ -397,7 +397,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): # Add the new repo to the 
dynamic database config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = os.path.join(dst_dir, "theorems") + theorems_folder = os.path.join(dst_dir, "random") premise_files_corpus = os.path.join(dst_dir, "corpus.jsonl") files_traced = os.path.join(dst_dir, "traced_files.jsonl") @@ -409,7 +409,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): "lean_version": v, "lean_dojo_version": lean_dojo.__version__, "metadata": { - "date_processed": datetime.datetime.now(), + "date_processed": datetime.now(), }, "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, @@ -551,4 +551,4 @@ def should_skip_repo(): with open(skip_file_path, "r") as f: repo_url = f.read().strip() return True, repo_url - return False, None \ No newline at end of file + return False, None diff --git a/leanagent.py b/leanagent.py index c87e84e..7f9801c 100644 --- a/leanagent.py +++ b/leanagent.py @@ -8,6 +8,7 @@ import time import traceback +from contextlib import contextmanager from datetime import datetime, timedelta from pathlib import Path from typing import List, Optional, Set, Tuple @@ -52,6 +53,30 @@ repos = [] +@contextmanager +def _locked(path: str, mode: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, mode) as handle: + fcntl.flock(handle.fileno(), fcntl.LOCK_EX) + try: + yield handle + finally: + if any(flag in mode for flag in ("w", "a", "+")): + handle.flush() + os.fsync(handle.fileno()) + fcntl.flock(handle.fileno(), fcntl.LOCK_UN) + + +def read_json_locked(path: str): + with _locked(path, "r") as handle: + return json.load(handle) + + +def write_json_locked(path: str, obj) -> None: + with _locked(path, "w") as handle: + json.dump(obj, handle, indent=2, sort_keys=True) + + def _eval(data, preds_map) -> Tuple[float, float, float]: """Evaluates the retrieval model.""" R1 = [] @@ -507,7 +532,7 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 4 + num_repos = 3 dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -529,18 +554,11 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") - lock_path = f"{repo_info_file}.lock" max_attempts = 30 for attempt in range(max_attempts): try: - with open(lock_path, "a") as lock_handle: - fcntl.flock(lock_handle.fileno(), fcntl.LOCK_EX) - try: - with open(repo_info_file, "r") as f: - repo_info = json.load(f) - break - finally: - fcntl.flock(lock_handle.fileno(), fcntl.LOCK_UN) + repo_info = read_json_locked(repo_info_file) + break except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: raise Exception( diff --git a/requirements-local.txt b/requirements-local.txt new file mode 100644 index 0000000..2a77334 --- /dev/null +++ b/requirements-local.txt @@ -0,0 +1,20 @@ +numpy +loguru +rich +pydantic +gitpython +requests +tqdm +docker +filelock +typing_extensions +ray +python-dotenv +toml +PyGithub +networkx +rank_bm25 +lxml +regex +packaging +psutil diff --git a/requirements.cpu.txt b/requirements.cpu.txt new file mode 100644 index 0000000..f41b96f --- /dev/null +++ b/requirements.cpu.txt @@ -0,0 +1,17 @@ +pytorch-lightning[extra] +numpy +deepspeed +lean_dojo==1.9.0 +loguru +networkx +ray +requests +torch +tqdm +transformers +openai +python-dotenv +rank_bm25 +torchmetrics +pytest==8.4.0 
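The repository metadata written by add_repo_to_database stores "date_processed" as a datetime value, and the later commits in this series ("Fix dynamic database datetime serialization", "Use datetime.now helper correctly") are about how that value reaches JSON. A minimal standalone sketch of one way to round-trip such a field; this is only an illustration, not the repository's actual serializer:

import json
from datetime import datetime

def encode_datetimes(obj):
    # json.dump raises TypeError on datetime values, so convert them to
    # ISO-8601 strings before they hit the dynamic-database JSON file.
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"{type(obj).__name__} is not JSON serializable")

metadata = {"date_processed": datetime.now()}
serialized = json.dumps(metadata, default=encode_datetimes)
restored = datetime.fromisoformat(json.loads(serialized)["date_processed"])
print(serialized)
print(restored)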
+pytest-cov==6.2.1 diff --git a/retrieval/bm25/main.py b/retrieval/bm25/main.py index 1d41415..48eb9b3 100644 --- a/retrieval/bm25/main.py +++ b/retrieval/bm25/main.py @@ -92,30 +92,21 @@ def _process_theorem( @ray.remote(num_cpus=1) -""" -A Ray remote class for processing theorems with BM25 retrieval. - -This class handles the initialization of necessary components for theorem processing, -including loading the tokenizer, corpus, and setting up the BM25 retrieval model. -It provides a method to process individual theorems by retrieving relevant premises. - -Parameters ----------- -tokenizer_path : str - Path to the tokenizer file -data_path : str - Path to the data directory containing corpus files -num_retrieved : int - Number of premises to retrieve for each theorem -use_all_premises : bool - Whether to use all available premises or just retrieved ones - -Methods -------- -process_theorem(thm: Dict[str, Any]) - Process a single theorem, retrieving relevant premises using BM25 -""" class TheoremProcessor: + """ + Ray remote class that processes theorems with BM25 retrieval. + + Parameters + ---------- + tokenizer_path : str + Path to the tokenizer file. + data_path : str + Path to the data directory containing corpus files. + num_retrieved : int + Number of premises to retrieve for each theorem. + use_all_premises : bool + Whether to use all available premises or just retrieved ones. + """ def __init__( self, tokenizer_path: str, diff --git a/scripts/manual_trace.py b/scripts/manual_trace.py new file mode 100644 index 0000000..3fe35b1 --- /dev/null +++ b/scripts/manual_trace.py @@ -0,0 +1,76 @@ +import argparse, json, os, sys +from pathlib import Path + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--repo", required=True, help="Path to repo root (has .lake/)") + ap.add_argument("--url", required=True, help="Repo URL (e.g. https://github.com/owner/name)") + ap.add_argument("--commit", default="", help="Commit SHA (defaults to git rev-parse HEAD)") + ap.add_argument("--out_root", required=True, help="Datasets root (e.g. 
RAID/data)") + ap.add_argument("--zip", action="store_true", help="Also create a zip bundle with IR + corpus.jsonl") + args = ap.parse_args() + + repo = Path(args.repo).resolve() + if not repo.exists(): + print(f"[ERR] Repo not found: {repo}", file=sys.stderr) + sys.exit(2) + + # detect commit if not given + commit = args.commit.strip() + if not commit: + import subprocess + try: + commit = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(repo)).decode().strip() + except Exception as e: + print(f"[ERR] Could not detect commit via git: {e}", file=sys.stderr) + sys.exit(2) + + ir_root = repo / ".lake" / "build" / "ir" + if not ir_root.is_dir(): + print(f"[ERR] IR dir not found: {ir_root}\nRun `lake build` first.", file=sys.stderr) + sys.exit(2) + + asts = sorted(ir_root.rglob("*.ast.json")) + if not asts: + print(f"[ERR] No *.ast.json files under {ir_root}", file=sys.stderr) + sys.exit(2) + + # dataset folder name: owner_repo_commit + owner_repo = "/".join(args.url.rstrip("/").split("/")[-2:]) + owner_repo_flat = owner_repo.replace("/", "_") + out_dir = Path(args.out_root) / f"{owner_repo_flat}_{commit}" + out_dir.mkdir(parents=True, exist_ok=True) + + corpus_path = out_dir / "corpus.jsonl" + with corpus_path.open("w") as f: + for p in asts: + rec = { + "repo_url": args.url, + "commit": commit, + "ast_path": str(p.relative_to(repo)), + } + f.write(json.dumps(rec) + "\n") + + print(f"[OK] Wrote {corpus_path} records: {len(asts)}") + + if args.zip: + # bundle: corpus.jsonl + IR tree + exports = Path(os.environ.get("RAID_DIR", repo.parent.parent)) / "exports" + exports.mkdir(parents=True, exist_ok=True) + zip_name = f"{owner_repo_flat}_{commit}_bundle.zip" + zip_path = exports / zip_name + + # use system zip via subprocess to preserve paths + import subprocess + cmd = [ + "zip","-r", str(zip_path), + str(corpus_path), + str(ir_root), + "-x","*.DS_Store" + ] + print("[ZIP] ", " ".join(cmd)) + subprocess.check_call(cmd, cwd=str(Path(os.environ.get("RAID_DIR", repo.parent.parent)))) + print(f"[OK] Bundle ready: {zip_path}") + +if __name__ == "__main__": + main() diff --git a/testfile_root b/testfile_root new file mode 100644 index 0000000..6824beb --- /dev/null +++ b/testfile_root @@ -0,0 +1,2 @@ +hello +EOF && ls /Users/aum/Desktop/leanagent-work/LeanAgent/testfile_root From 17d166f6ba1f6e83bc7b37f1b9cc026032108f10 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:21:12 -0400 Subject: [PATCH 13/29] Fix dynamic database datetime serialization --- dynamic_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamic_database.py b/dynamic_database.py index 4f1b297..6ab7352 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,6 +1,6 @@ from __future__ import annotations import time -from datetime import datetime +import datetime import json import os import random From 458684d9c5ae4af4246ee7b6a983ca91a74032e7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:28:00 -0400 Subject: [PATCH 14/29] Use datetime.now helper correctly --- leanagent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/leanagent.py b/leanagent.py index 7f9801c..c4e1116 100644 --- a/leanagent.py +++ b/leanagent.py @@ -270,7 +270,7 @@ def prove_sorry_theorems( all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = ( set() ) - last_save_time = datetime.datetime.now() + last_save_time = datetime.now() save_interval = timedelta(minutes=30) # Load previously encountered theorems @@ -333,7 +333,7 @@ def 
prove_sorry_theorems( theorem_batch = [] positions_batch = [] - current_time = datetime.datetime.now() + current_time = datetime.now() if current_time - last_save_time >= save_interval: save_progress(all_encountered_theorems) last_save_time = current_time From b9f4f8573abf07256b12f3d04b054d065f9335db Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 19:28:52 -0400 Subject: [PATCH 15/29] Retry curriculum repo discovery until minimum satisfied --- leanagent.py | 59 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/leanagent.py b/leanagent.py index c4e1116..e911522 100644 --- a/leanagent.py +++ b/leanagent.py @@ -429,23 +429,54 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p logger.warning("num_repos should be at least 3 for curriculum learning") - lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) - - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - print("\n\n") - logger.info(f"Processing new repo: {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - + existing_repo_count = len(db.repositories) + target_repo_count = max(3, num_repos) + + lean_git_repos, repos = search_github_repositories( + lean_git_repos, repos, "Lean", target_repo_count + ) + + processed_idx = 0 + extra_searches = 0 + max_extra_searches = 10 + + while len(db.repositories) < target_repo_count: + while ( + processed_idx < len(lean_git_repos) + and len(db.repositories) < target_repo_count + ): + repo = lean_git_repos[processed_idx] + processed_idx += 1 + print("\n\n") + logger.info(f"Processing new repo: {repo.url}") + result = add_repo_to_database( + dynamic_database_json_path, repo, db + ) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + if len(db.repositories) >= target_repo_count: + break + + if extra_searches >= max_extra_searches: + raise ValueError( + "Unable to find enough compatible repositories for curriculum learning" + ) + + extra_searches += 1 + needed = max(1, target_repo_count - len(db.repositories)) + logger.info( + f"Searching for {needed} additional repositories to meet the curriculum requirement" + ) + lean_git_repos, repos = search_github_repositories( + lean_git_repos, repos, "Lean", needed + ) + + newly_added = len(db.repositories) - existing_repo_count logger.info( - f"Successfully added {num_repos} repositories to the database" + f"Successfully added {newly_added} repositories to the database (total: {len(db.repositories)})" ) - if len(db.repositories) < 3: - raise ValueError("The database should contain at least 3 repositories for curriculum learning") - sorted_repos, categorized_theorems, percentiles = ( sort_repositories_by_difficulty(db) ) From c84672e647f8be6c1e86dce664dcab91fe4e611a Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 19:38:49 -0400 Subject: [PATCH 16/29] Cache dataset exports when artifacts already present --- generate_benchmark_lean4.py | 68 ++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 27056b5..d9ac591 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -9,7 +9,7 @@ from copy import copy from datetime import datetime from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, 
Union, Optional, Tuple import os import lean_dojo import networkx as nx @@ -24,6 +24,49 @@ SPLIT = Dict[SPLIT_NAME, List[TracedTheorem]] SPLIT_STRATEGY = str _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +_REQUIRED_EXPORT_FILES = [ + ("metadata.json",), + ("corpus.jsonl",), + ("traced_files.jsonl",), + ("random", "train.json"), + ("random", "val.json"), + ("random", "test.json"), + ("novel_premises", "train.json"), + ("novel_premises", "val.json"), + ("novel_premises", "test.json"), +] + + +def _existing_export_stats(dst_path: Union[str, Path]) -> Optional[Tuple[int, int, int]]: + """Return cached export statistics if the dataset artifacts already exist.""" + dst_path = Path(dst_path) + required_paths = [dst_path.joinpath(*parts) for parts in _REQUIRED_EXPORT_FILES] + + if not all(path.is_file() and path.stat().st_size > 0 for path in required_paths): + return None + + metadata_path = dst_path / "metadata.json" + try: + metadata = json.load(metadata_path.open("rt")) + except (OSError, json.JSONDecodeError): + return None + + required_keys = {"total_theorems", "num_premises", "num_files_traced"} + if not required_keys.issubset(metadata.keys()): + return None + + try: + total_theorems = int(metadata["total_theorems"]) + num_premises = int(metadata["num_premises"]) + num_files_traced = int(metadata["num_files_traced"]) + except (TypeError, ValueError): + return None + + if total_theorems <= 0 or num_premises < 0 or num_files_traced <= 0: + return None + + logger.info(f"Reusing cached export at {dst_path}") + return num_premises, num_files_traced, total_theorems def get_lean4_version_from_config(toolchain: str) -> str: @@ -465,7 +508,19 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - export_metadata(traced_repo, dst_path, **kwargs) + split_summary = { + strategy: {name: len(theorems) for name, theorems in split.items()} + for strategy, split in splits.items() + } + export_metadata( + traced_repo, + dst_path, + total_theorems=total_theorems, + num_premises=num_premises, + num_files_traced=num_files_traced, + split_counts=split_summary, + **kwargs, + ) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -556,9 +611,14 @@ def main(url, commit, dst_dir): except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 - + + cached_stats = _existing_export_stats(dst_dir) + if cached_stats is not None: + num_premises, num_files_traced, total_theorems = cached_stats + return traced_repo, num_premises, num_files_traced, total_theorems + safe_remove_dir(dst_dir) - + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( From f7b729e807215600da71b8bab51fa821aa91cfa7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 20:46:18 -0400 Subject: [PATCH 17/29] Revert export caching guard --- generate_benchmark_lean4.py | 68 +++---------------------------------- 1 file changed, 4 insertions(+), 64 deletions(-) diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index d9ac591..27056b5 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -9,7 +9,7 @@ from copy import copy from datetime import datetime from pathlib import Path -from typing import Dict, List, Union, Optional, Tuple +from typing import Dict, List, Union import os import lean_dojo import networkx as nx @@ -24,49 +24,6 @@ SPLIT = Dict[SPLIT_NAME, 
List[TracedTheorem]] SPLIT_STRATEGY = str _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") -_REQUIRED_EXPORT_FILES = [ - ("metadata.json",), - ("corpus.jsonl",), - ("traced_files.jsonl",), - ("random", "train.json"), - ("random", "val.json"), - ("random", "test.json"), - ("novel_premises", "train.json"), - ("novel_premises", "val.json"), - ("novel_premises", "test.json"), -] - - -def _existing_export_stats(dst_path: Union[str, Path]) -> Optional[Tuple[int, int, int]]: - """Return cached export statistics if the dataset artifacts already exist.""" - dst_path = Path(dst_path) - required_paths = [dst_path.joinpath(*parts) for parts in _REQUIRED_EXPORT_FILES] - - if not all(path.is_file() and path.stat().st_size > 0 for path in required_paths): - return None - - metadata_path = dst_path / "metadata.json" - try: - metadata = json.load(metadata_path.open("rt")) - except (OSError, json.JSONDecodeError): - return None - - required_keys = {"total_theorems", "num_premises", "num_files_traced"} - if not required_keys.issubset(metadata.keys()): - return None - - try: - total_theorems = int(metadata["total_theorems"]) - num_premises = int(metadata["num_premises"]) - num_files_traced = int(metadata["num_files_traced"]) - except (TypeError, ValueError): - return None - - if total_theorems <= 0 or num_premises < 0 or num_files_traced <= 0: - return None - - logger.info(f"Reusing cached export at {dst_path}") - return num_premises, num_files_traced, total_theorems def get_lean4_version_from_config(toolchain: str) -> str: @@ -508,19 +465,7 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - split_summary = { - strategy: {name: len(theorems) for name, theorems in split.items()} - for strategy, split in splits.items() - } - export_metadata( - traced_repo, - dst_path, - total_theorems=total_theorems, - num_premises=num_premises, - num_files_traced=num_files_traced, - split_counts=split_summary, - **kwargs, - ) + export_metadata(traced_repo, dst_path, **kwargs) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -611,14 +556,9 @@ def main(url, commit, dst_dir): except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 - - cached_stats = _existing_export_stats(dst_dir) - if cached_stats is not None: - num_premises, num_files_traced, total_theorems = cached_stats - return traced_repo, num_premises, num_files_traced, total_theorems - + safe_remove_dir(dst_dir) - + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( From 2f243a69c832d30af6f7af769425b183ce4f161f Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 4 Nov 2025 08:22:40 -0500 Subject: [PATCH 18/29] Instrument tracing workflow and add skip prompts --- constants.py | 3 +- generate_benchmark_lean4.py | 44 ++++++++++----- git_utils.py | 103 ++++++++++++++++++++++++++++++++---- leanagent.py | 46 ++++++++++++++-- trace_only.py | 22 ++++++++ 5 files changed, 193 insertions(+), 25 deletions(-) create mode 100644 trace_only.py diff --git a/constants.py b/constants.py index d11b663..9313943 100644 --- a/constants.py +++ b/constants.py @@ -158,6 +158,7 @@ "proost-assistant/ProostLean", "DavePearce/LeanEVM", "algebraic-dev/ash", + "google-deepmind/formal-conjectures", "FormalizedFormalLogic/Arithmetization", "cmu-l3/ntp-toolkit", "dwrensha/tryAtEachStep", @@ -241,4 +242,4 @@ # Added by Mo to find smaller repo to iterate on -] \ No 
newline at end of file +] diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 27056b5..7fed1d2 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -465,7 +465,19 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - export_metadata(traced_repo, dst_path, **kwargs) + split_summary = { + strategy: {name: len(theorems) for name, theorems in split.items()} + for strategy, split in splits.items() + } + export_metadata( + traced_repo, + dst_path, + total_theorems=total_theorems, + num_premises=num_premises, + num_files_traced=num_files_traced, + split_counts=split_summary, + **kwargs, + ) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -529,17 +541,15 @@ def main(url, commit, dst_dir): logger.info("Unsupported version") v = v[1:] # ignore "v" at beginning - lean_dir2 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" - lean_dir3 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" - logger.info(f"lean path2 {lean_dir2}") - logger.info(f"lean path3 {lean_dir3}") - if not os.path.exists(lean_dir2): - logger.info(f"Lean toolchain path 2 does not exist: {lean_dir2}") - if not os.path.exists(lean_dir3): - logger.info(f"Lean toolchain path 3 does not exist: {lean_dir3}") - os.environ["LEAN4_PATH"] = lean_dir2 - os.environ["PATH"] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" - logger.info(f"Switched to Lean toolchain at: {lean_dir2}") + elan_toolchains = Path( + os.environ.get("ELAN_TOOLCHAINS", Path.home() / ".elan" / "toolchains") + ) + lean_dir = elan_toolchains / f"leanprover--lean4---{v}" + if not lean_dir.exists(): + logger.warning(f"Lean toolchain path does not exist locally: {lean_dir}") + os.environ["LEAN4_PATH"] = str(lean_dir) + os.environ["PATH"] = f"{lean_dir}/bin:{os.environ.get('PATH', '')}" + logger.info(f"Switched to Lean toolchain at: {lean_dir}") logger.info( f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}" @@ -553,6 +563,12 @@ def main(url, commit, dst_dir): logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") + traced_files_count = len(traced_repo.traced_files) + deps_count = sum(len(tf.get_premise_definitions()) for tf in traced_repo.traced_files) + logger.info( + f"Trace summary for {url}@{commit}: " + f"{traced_files_count} traced files, {deps_count} premise definitions discovered" + ) except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 @@ -565,4 +581,8 @@ def main(url, commit, dst_dir): traced_repo, splits, dst_dir ) logger.info("Successfully exported the data") + logger.info( + f"Export summary for {url}@{commit}: " + f"{total_theorems} theorems, {num_premises} premises, {num_files_traced} traced files" + ) return traced_repo, num_premises, num_files_traced, total_theorems diff --git a/git_utils.py b/git_utils.py index 4883df1..b0e3672 100644 --- a/git_utils.py +++ b/git_utils.py @@ -15,7 +15,7 @@ from loguru import logger -from typing import Union, List, Tuple +from typing import Union, List, Tuple, Optional import math import os @@ -25,6 +25,46 @@ BATCH_SIZE = 4 from filenames import REPO_DIR, DATA_DIR +MIN_SUPPORTED_LEAN_VERSION = (4, 6, 0) +MIN_SUPPORTED_LEAN_VERSION_STR = "v4.6.0" +PAUSE_AFTER_TRACE = os.environ.get("PAUSE_AFTER_TRACE", "0") == "1" + + +def _parse_lean_version(version: str) -> Optional[Tuple[int, int, int]]: + version = 
version.lower().lstrip("v") + if not version: + return None + base = version.split("-")[0] + parts = base.split(".") + if len(parts) < 2: + return None + while len(parts) < 3: + parts.append("0") + try: + major, minor, patch = (int(parts[0]), int(parts[1]), int(parts[2])) + except ValueError: + return None + return major, minor, patch + + +def _is_supported_lean_version(version: str) -> bool: + parsed = _parse_lean_version(version) + if parsed is None: + return False + return parsed >= MIN_SUPPORTED_LEAN_VERSION + + +def _pause_after_trace(repo_url: str, status: str) -> None: + if not PAUSE_AFTER_TRACE: + return + try: + input( + f"[TRACE] {repo_url} finished with status '{status}'. " + "Press Enter to continue..." + ) + except KeyboardInterrupt: + raise + def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" @@ -360,38 +400,81 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url + ".git" logger.info(f"\n\nProcessing {url}") + normalized_url = url.replace(".git", "") + sha, v = get_compatible_commit(url) if not sha: logger.info(f"Failed to find a compatible commit for {url}") - return None + status = "no_compatible_commit" + _pause_after_trace(normalized_url, status) + return status logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + if db.get_repository(normalized_url, sha) is not None: + logger.info( + f"Repository {normalized_url}@{sha} already present in dynamic database. Skipping." + ) + status = "already_present" + _pause_after_trace(normalized_url, status) + return status + + parsed_version = _parse_lean_version(v) + if parsed_version is None or not _is_supported_lean_version(v): + logger.info( + f"Skipping {normalized_url} due to unsupported Lean toolchain {v}. 
" + f"Minimum required {MIN_SUPPORTED_LEAN_VERSION_STR}" + ) + status = "unsupported_toolchain" + _pause_after_trace(normalized_url, status) + return status + # Ensure that the repo is checked out to the compatible commit repo_name, _ = clone_repo(url) subprocess.run(["git", "-C", repo_name, "checkout", sha], check=True) logger.info(f"Checked out {url} to commit {sha}") - url = url.replace(".git", "") - repo = LeanGitRepo(url, sha) + repo = LeanGitRepo(normalized_url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( + traced_repo, num_premises, num_files_traced, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) if not traced_repo: logger.info(f"Failed to trace {url}") - return None + shutil.rmtree(dst_dir, ignore_errors=True) + status = "trace_failed" + _pause_after_trace(normalized_url, status) + return status + if total_theorems is None: + logger.info(f"Trace produced no theorem count for {url}") + shutil.rmtree(dst_dir, ignore_errors=True) + status = "missing_theorem_count" + _pause_after_trace(normalized_url, status) + return status + + logger.info( + f"Trace produced {total_theorems} theorems for {url} " + f"(minimum required {3 * BATCH_SIZE})" + ) + if total_theorems < 3 * BATCH_SIZE: # Require enough theorems for train/val/test logger.info(f"Not enough theorems found in {url}") - return None - + shutil.rmtree(dst_dir, ignore_errors=True) + status = "insufficient_theorems" + _pause_after_trace(normalized_url, status) + return status + + logger.info( + f"Export includes {num_premises} premises across {num_files_traced} traced files" + ) + logger.info(f"Finished generating benchmark at {dst_dir}") # Add the new repo to the dynamic database @@ -427,7 +510,9 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): db.print_database_contents() db.to_json(dynamic_database_json_path) - return "Done" + status = "success" + _pause_after_trace(normalized_url, status) + return status def calculate_difficulty(theorem: Theorem) -> Union[float, None]: """Calculates the difficulty of a theorem.""" diff --git a/leanagent.py b/leanagent.py index e911522..edaa5a1 100644 --- a/leanagent.py +++ b/leanagent.py @@ -52,6 +52,21 @@ lean_git_repos = [] repos = [] +SEED_REPOS = [ + LeanGitRepo( + "https://github.com/ImperialCollegeLondon/FLT", + "b208a302cdcbfadce33d8165f0b054bfa17e2147", + ), + LeanGitRepo( + "https://github.com/HEPLean/PhysLean", + "60f1ebc3eb015f78a3719ee4085344a600d0af50", + ), + LeanGitRepo( + "https://github.com/verse-lab/veil", + "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + ), +] + @contextmanager def _locked(path: str, mode: str): @@ -427,8 +442,26 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p if is_main_process: if num_repos < 3: logger.warning("num_repos should be at least 3 for curriculum learning") - - + + failure_records: List[Tuple[str, str]] = [] + + for seed_repo in SEED_REPOS: + if db.get_repository(seed_repo.url, seed_repo.commit) is None: + logger.info( + f"Seeding database with {seed_repo.url}@{seed_repo.commit}" + ) + result = add_repo_to_database( + dynamic_database_json_path, seed_repo, db + ) + if result in ("success", "already_present"): + logger.info(f"Seeded repo {seed_repo.url}") + else: + failure_records.append((seed_repo.url, result)) + else: + logger.info( + f"Seed repository {seed_repo.url}@{seed_repo.commit} already present" + 
) + existing_repo_count = len(db.repositories) target_repo_count = max(3, num_repos) @@ -452,8 +485,10 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p result = add_repo_to_database( dynamic_database_json_path, repo, db ) - if result is not None: + if result in ("success", "already_present"): logger.info(f"Successfully added repo {repo.url}") + else: + failure_records.append((repo.url, result)) if len(db.repositories) >= target_repo_count: break @@ -472,6 +507,11 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p lean_git_repos, repos, "Lean", needed ) + if failure_records: + logger.warning("Tracing failures/skip summary:") + for repo_url, reason in failure_records: + logger.warning(f" {repo_url} -> {reason}") + newly_added = len(db.repositories) - existing_repo_count logger.info( f"Successfully added {newly_added} repositories to the database (total: {len(db.repositories)})" diff --git a/trace_only.py b/trace_only.py new file mode 100644 index 0000000..17c0d1b --- /dev/null +++ b/trace_only.py @@ -0,0 +1,22 @@ +from lean_dojo import LeanGitRepo +from dynamic_database import DynamicDatabase +from git_utils import add_repo_to_database +from filenames import RAID_DIR, DB_FILE_NAME +from pathlib import Path + +SEED_REPOS = [ + ("https://github.com/ImperialCollegeLondon/FLT", "b208a302cdcbfadce33d8165f0b054bfa17e2147"), + ("https://github.com/HEPLean/PhysLean", "60f1ebc3eb015f78a3719ee4085344a600d0af50"), + ("https://github.com/verse-lab/veil", "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781"), +] + +db_path = Path(RAID_DIR) / DB_FILE_NAME +if db_path.exists(): + db = DynamicDatabase.from_json(db_path) +else: + db = DynamicDatabase() + +for url, commit in SEED_REPOS: + repo = LeanGitRepo(url, commit) + print(f"Tracing {url}@{commit}") + add_repo_to_database(str(db_path), repo, db) From f8d29d6fc952566e61697933d8090cbc4834e8b4 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 6 Nov 2025 10:06:34 -0500 Subject: [PATCH 19/29] Add paper tracing scripts and enable merged dataset build --- constants.py | 73 ++++++++++++ leanagent.py | 36 ++++-- scripts/build_merged_dataset.py | 124 ++++++++++++++++++++ scripts/trace_paper_repos.py | 202 ++++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 8 deletions(-) create mode 100644 scripts/build_merged_dataset.py create mode 100755 scripts/trace_paper_repos.py diff --git a/constants.py b/constants.py index 9313943..a801d9b 100644 --- a/constants.py +++ b/constants.py @@ -243,3 +243,76 @@ # Added by Mo to find smaller repo to iterate on ] + +# Repos that appear in the paper – trace these first, in this order. 
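The toolchain gate added to git_utils.py above turns strings such as "v4.9.0-rc1" into integer tuples and compares them against MIN_SUPPORTED_LEAN_VERSION. A self-contained illustration of that comparison, re-implemented here as a sketch rather than importing the repository helper:

from typing import Optional, Tuple

MIN_SUPPORTED = (4, 6, 0)  # mirrors MIN_SUPPORTED_LEAN_VERSION above

def parse_lean_version(version: str) -> Optional[Tuple[int, int, int]]:
    # Accept forms like "v4.9.0" or "4.6.0-rc1": drop the leading "v" and
    # any pre-release suffix, then pad to (major, minor, patch).
    base = version.lower().lstrip("v").split("-")[0]
    parts = base.split(".")
    if len(parts) < 2:
        return None
    parts += ["0"] * (3 - len(parts))
    try:
        major, minor, patch = (int(parts[0]), int(parts[1]), int(parts[2]))
    except ValueError:
        return None
    return major, minor, patch

# Tuple comparison is lexicographic, so "v4.10.0" correctly sorts above "v4.6.0".
for candidate in ["v4.9.0-rc1", "v4.5.0", "4.10.0", "nightly"]:
    parsed = parse_lean_version(candidate)
    print(candidate, parsed, parsed is not None and parsed >= MIN_SUPPORTED)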
+PAPER_REPOS = [ + {"url": "https://github.com/leanprover-community/PFR", + "commit": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, + + {"url": "https://github.com/leanprover-community/hairy-ball-theorem", + "commit": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, + + {"url": "https://github.com/leanprover-community/coxeter", + "commit": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, + + {"url": "https://github.com/avigad/mathematics_in_lean_source", + "commit": "5297e0fb051367c48c0a084411853a576389ecf5"}, + + {"url": "https://github.com/leanprover-community/formal-book", + "commit": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, + + {"url": "https://github.com/yangky11/miniF2F-lean4", + "commit": "9e445f5435407f014b88b44a98436d50dd7abd00"}, + + {"url": "https://github.com/lecopivo/SciLean", + "commit": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, + + {"url": "https://github.com/leanprover-community/carleson", + "commit": "bec7808b907190882fa1fa54ce749af297c6cf37"}, + + {"url": "https://github.com/leanprover-community/lean4-pdl", + "commit": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, + + {"url": "https://github.com/AlexKontorovich/PrimeNumberTheoremAnd", + "commit": "29baddd685660b5fedd7bd67f9916ae24253d566"}, + + {"url": "https://github.com/dwrensha/compfiles", + "commit": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, + + {"url": "https://github.com/ImperialCollegeLondon/FLT", + "commit": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, + + {"url": "https://github.com/TODO/debate", + "commit": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, + + {"url": "https://github.com/TODO/lean4lean", + "commit": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, + + {"url": "https://github.com/eric-wieser/lean-matrix-cookbook", + "commit": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, + + {"url": "https://github.com/TODO/math-workshop", + "commit": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, + + {"url": "https://github.com/TODO/LeanEuclid", + "commit": "f1912c3090eb82820575758efc31e40b9db86bb8"}, + + {"url": "https://github.com/FormalizedFormalLogic/Foundation", + "commit": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, + + {"url": "https://github.com/TODO/Con-nf", + "commit": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, + + {"url": "https://github.com/TODO/Saturn", + "commit": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, + + {"url": "https://github.com/ahhwuhu/zeta_3_irrational", + "commit": "914712200e463cfc97fe37e929d518dd58806a38"}, + + {"url": "https://github.com/TODO/Formalization-of-Constructable-Numbers", + "commit": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, + + {"url": "https://github.com/LeanAPAP/LeanAPAP", + "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, +] + diff --git a/leanagent.py b/leanagent.py index edaa5a1..8fb50a0 100644 --- a/leanagent.py +++ b/leanagent.py @@ -70,7 +70,9 @@ @contextmanager def _locked(path: str, mode: str): - os.makedirs(os.path.dirname(path), exist_ok=True) + directory = os.path.dirname(path) + if directory: + os.makedirs(directory, exist_ok=True) with open(path, mode) as handle: fcntl.flock(handle.fileno(), fcntl.LOCK_EX) try: @@ -87,9 +89,27 @@ def read_json_locked(path: str): return json.load(handle) -def write_json_locked(path: str, obj) -> None: +def write_json_locked( + path: str, + obj, + *, + indent: int = 2, + ensure_ascii: bool = False, + sort_keys: bool = False, +) -> None: with _locked(path, "w") as handle: - json.dump(obj, handle, indent=2, sort_keys=True) + json.dump( + obj, + handle, + indent=indent, + ensure_ascii=ensure_ascii, + sort_keys=sort_keys, + ) + 
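The _locked / read_json_locked / write_json_locked helpers above guard shared JSON files with an exclusive fcntl.flock so concurrent workers do not interleave reads and writes. A minimal standalone sketch of the same pattern, using a hypothetical progress file rather than any of the repository's real paths:

import fcntl
import json
import os
from contextlib import contextmanager

@contextmanager
def locked(path: str, mode: str):
    # Hold an exclusive lock for the lifetime of the handle; flush and fsync
    # before releasing when the file was opened for writing.
    with open(path, mode) as handle:
        fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
        try:
            yield handle
        finally:
            if any(flag in mode for flag in ("w", "a", "+")):
                handle.flush()
                os.fsync(handle.fileno())
            fcntl.flock(handle.fileno(), fcntl.LOCK_UN)

progress_file = "progress.json"  # hypothetical file, for illustration only
if not os.path.exists(progress_file):
    with locked(progress_file, "w") as f:
        json.dump({"attempts": 0}, f)

# Read-modify-write under a single lock so no other process sees a half-update.
with locked(progress_file, "r+") as f:
    state = json.load(f)
    state["attempts"] += 1
    f.seek(0)
    f.truncate()
    json.dump(state, f, indent=2)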
+ +def save_database_locked(db: DynamicDatabase, path: str) -> None: + """Persist the dynamic database safely across processes.""" + write_json_locked(path, db.to_dict(), ensure_ascii=False) def _eval(data, preds_map) -> Tuple[float, float, float]: @@ -222,7 +242,7 @@ def process_theorem_batch( else: logger.warning(f"Unexpected result type") - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) def save_progress(all_encountered_theorems): @@ -414,7 +434,7 @@ def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: f"\nInitializing new database at {dynamic_database_json_path}\n" ) db = DynamicDatabase() - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) else: try: logger.info(f"Loading database from {dynamic_database_json_path}") @@ -426,7 +446,7 @@ def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." ) db = DynamicDatabase() - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) return db @@ -522,7 +542,7 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p ) print("Sorted repositories. Saving now...") - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) save_sorted_repos(sorted_repos, "sorted_repos.json") print("Summary of theorem difficulties by URL:") @@ -982,7 +1002,7 @@ def main(): prove_sorry_theorems( db, prover, dynamic_database_json_path, repos_for_proving ) - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) logger.info("Finished searching for proofs of sorry theorems") diff --git a/scripts/build_merged_dataset.py b/scripts/build_merged_dataset.py new file mode 100644 index 0000000..c1879f4 --- /dev/null +++ b/scripts/build_merged_dataset.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Build a merged dataset from already-traced repos. + +This script: + - Loads/creates the dynamic database at RAID/db_file.txt + - For each non-empty corpus under RAID/data/_/corpus.jsonl, + it infers (repo_url, commit) from the first line and adds the repo to the DB + using git_utils.add_repo_to_database (reuses LeanDojo caches if present). + - Exports a merged dataset to RAID/data/merged_paper_subset. 
+ +Run from repo root: + export RAID_DIR="$PWD/RAID" + export REPO_DIR="$RAID_DIR/repos" + python LeanAgent/scripts/build_merged_dataset.py +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +from loguru import logger +import sys + +# Ensure repo modules are importable when running as a script +HERE = Path(__file__).resolve() +REPO_ROOT = HERE.parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from lean_dojo import LeanGitRepo # noqa: E402 +from dynamic_database import DynamicDatabase # noqa: E402 +from filenames import RAID_DIR, DB_FILE_NAME # noqa: E402 +from git_utils import add_repo_to_database # noqa: E402 +from scripts.trace_paper_repos import PAPER_REPOS # noqa: E402 + + +def iter_nonempty_corpora(data_root: Path): + for d in sorted(data_root.iterdir()): + cj = d / "corpus.jsonl" + if not cj.exists() or cj.stat().st_size == 0: + continue + yield d, cj + + +_SLUG_TO_URL = { + item["name"]: f"https://github.com/{item['owner']}/{item['name']}" + for item in PAPER_REPOS +} + + +def _infer_repo_from_dir(dir_path: Path) -> tuple[str, str]: + name = dir_path.name + if "_" not in name: + raise ValueError(f"Directory name {name} does not contain commit suffix") + slug, commit = name.rsplit("_", 1) + if len(commit) != 40: + raise ValueError(f"Directory {name} missing 40-char commit suffix") + url = _SLUG_TO_URL.get(slug) + if not url: + raise ValueError(f"Unknown repo slug '{slug}'. Please add it to PAPER_REPOS.") + return url, commit + + +def load_repo_from_corpus(corpus_path: Path) -> tuple[str, str]: + with corpus_path.open() as f: + first = f.readline() + url = commit = "" + if first: + try: + meta = json.loads(first) + url = meta.get("repo_url") or "" + commit = meta.get("commit") or "" + except Exception: + pass + if url and commit: + return url, commit + return _infer_repo_from_dir(corpus_path.parent) + + +def main() -> None: + raid_dir = Path(RAID_DIR) + db_path = raid_dir / DB_FILE_NAME + + db: DynamicDatabase + if not db_path.exists() or db_path.stat().st_size == 0: + logger.info(f"Initializing new database at {db_path}") + db = DynamicDatabase() + db.to_json(str(db_path)) + else: + logger.info(f"Loading database from {db_path}") + db = DynamicDatabase.from_json(str(db_path)) + + # Add repos discovered from existing corpora + data_root = raid_dir / "data" + targets = [] + for d, cj in iter_nonempty_corpora(data_root): + try: + url, commit = load_repo_from_corpus(cj) + targets.append((url, commit)) + except Exception as e: + logger.warning(f"Skipping {d} due to: {e}") + + logger.info(f"Found {len(targets)} repos with non-empty corpora to ingest") + + for url, commit in targets: + repo = LeanGitRepo(url, commit) + logger.info(f"Ingesting {url}@{commit}") + status = add_repo_to_database(str(db_path), repo, db) + logger.info(f"Status for {url}: {status}") + + # Export merged dataset + out_dir = raid_dir / "data" / "merged_paper_subset" + logger.info(f"Generating merged dataset at {out_dir}") + db.generate_merged_dataset(out_dir) + logger.info("DONE.") + + +if __name__ == "__main__": + if not os.environ.get("RAID_DIR"): + raise SystemExit("Please set RAID_DIR and REPO_DIR before running.") + main() diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py new file mode 100755 index 0000000..8d33d0a --- /dev/null +++ b/scripts/trace_paper_repos.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +""" +Trace the fixed set of paper repos and materialize corpus.jsonl +next to RAID/data/_/, mirroring the 
manual veil flow. + +Run from repo root: + export RAID_DIR="$PWD/RAID" + export REPO_DIR="$RAID_DIR/repos" + python scripts/trace_paper_repos.py +""" + +import os +import json +import pathlib + +from lean_dojo import LeanGitRepo +from lean_dojo.data_extraction.trace import get_traced_repo_path + + +# hardcoded list reconstructed from the paper / convo +PAPER_REPOS = [ + # 1. teorth/pfr + # { + # "owner": "teorth", + # "name": "pfr", + # "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", + # }, + # 2. avigad/mathematics_in_lean_source + { + "owner": "avigad", + "name": "mathematics_in_lean_source", + "sha": "5297e0fb051367c48c0a084411853a576389ecf5", + }, + { + "owner": "verse-lab", + "name": "veil", + "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + }, + # 3. miniF2F + { + "owner": "yangky11", + "name": "miniF2F-lean4", + "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", + }, + # 4. SciLean (in paper → we must make it work eventually) + # { + # "owner": "lecopivo", + # "name": "SciLean", + # "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", + # }, + # 5. teorth/lean4-pdl + { + "owner": "teorth", + "name": "lean4-pdl", + "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", + }, + # 6. prime number theorem notes + { + "owner": "AlexKontorovich", + "name": "PrimeNumberTheoremAnd", + "sha": "29baddd685660b5fedd7bd67f9916ae24253d566", + }, + # 7. compfiles + { + "owner": "dwrensha", + "name": "compfiles", + "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", + }, + # 8. FLT + { + "owner": "ImperialCollegeLondon", + "name": "FLT", + "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", + }, + { + "owner": "verse-lab", + "name": "veil", + "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + }, + # 9. lean4-cli (paper mentions tooling repos; we saw this in your crawl) + { + "owner": "leanprover-community", + "name": "lean4-cli", + "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", + }, + # 10. matrix cookbook + { + "owner": "eric-wieser", + "name": "lean-matrix-cookbook", + "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f", + }, + # 11. LeanEuclid + { + "owner": "loganrjmurphy", + "name": "LeanEuclid", + "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", + }, + # 12. formalized logic foundation + { + "owner": "FormalizedFormalLogic", + "name": "Foundation", + "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", + }, + # 13. con-nf + { + "owner": "pengbaolin", + "name": "con-nf", + "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", + }, + # 14. zeta_3_irrational + { + "owner": "ahhwuhu", + "name": "zeta_3_irrational", + "sha": "914712200e463cfc97fe37e929d518dd58806a38", + }, + # 15. LeanAPAP + { + "owner": "judicael-pvt", + "name": "LeanAPAP", + "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b", + }, + # paper had a few that we couldn't map to GH — keep extensible +] + + +def make_corpus_from_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> int: + """Scan .lake/build/ir for *.ast.json and write corpus.jsonl.""" + ir_root = source_root / ".lake" / "build" / "ir" + if not ir_root.exists(): + print(f" !! 
no .lake/build/ir in {source_root}, skipping corpus.jsonl") + return 0 + + recs = [] + for p in ir_root.rglob("*.ast.json"): + recs.append( + { + "repo_url": url, + "commit": commit, + "ast_path": str(p.relative_to(source_root)), + } + ) + + out_dir.mkdir(parents=True, exist_ok=True) + out_file = out_dir / "corpus.jsonl" + with out_file.open("w") as f: + for r in recs: + f.write(json.dumps(r) + "\n") + print(f" wrote {len(recs)} records to {out_file}") + return len(recs) + + +def main() -> None: + raid_dir = os.environ.get("RAID_DIR") + repo_dir = os.environ.get("REPO_DIR") + + if not raid_dir or not repo_dir: + raise SystemExit("Please set RAID_DIR and REPO_DIR before running.") + + raid_dir = pathlib.Path(raid_dir) + repo_dir = pathlib.Path(repo_dir) + + for item in PAPER_REPOS: + url = f"https://github.com/{item['owner']}/{item['name']}" + commit = item["sha"] + + print(f"\n=== tracing {url}@{commit} ===") + try: + repo = LeanGitRepo(url, commit) + traced_path = get_traced_repo_path(repo, build_deps=False) + traced_path = pathlib.Path(traced_path) + print(f" lean_dojo traced into cache: {traced_path}") + except Exception as e: + print(f" !! lean_dojo failed for {url}@{commit}: {e}") + continue + + # repo as checked out by the earlier crawl + repo_root = repo_dir / item["owner"] / item["name"] + out_dir = raid_dir / "data" / f"{item['name']}_{commit}" + + if not repo_root.exists(): + print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") + + sources = [traced_path] + if repo_root.exists(): + sources.append(repo_root) + + exported = 0 + for src in sources: + exported = make_corpus_from_repo(src, out_dir, url, commit) + if exported > 0: + break + + if exported > 0: + print(f" ✅ exported corpus for {item['name']} ({exported} files)") + else: + print(f" ⚠ traced but no IR — likely a build/env issue for this repo") + + print("\nDONE.") + + +if __name__ == "__main__": + main() From a9d441c2cbd50099ad1ab960103b47aa7ac5ef53 Mon Sep 17 00:00:00 2001 From: motiwari Date: Thu, 18 Sep 2025 11:13:37 -0700 Subject: [PATCH 20/29] Initial commit of changes --- __init__.py | 36 + constants.py | 102 ++ container.py | 369 +++++++ data_extraction/ExtractData.lean | 530 +++++++++ data_extraction/ast.py | 1576 +++++++++++++++++++++++++++ data_extraction/build_lean4_repo.py | 214 ++++ data_extraction/cache.py | 107 ++ data_extraction/lean.py | 702 ++++++++++++ data_extraction/trace.py | 135 +++ data_extraction/traced_data.py | 1224 +++++++++++++++++++++ interaction/Lean4Repl.lean | 357 ++++++ interaction/dojo.py | 549 ++++++++++ interaction/parse_goals.py | 69 ++ utils.py | 314 ++++++ 14 files changed, 6284 insertions(+) create mode 100644 __init__.py create mode 100644 container.py create mode 100644 data_extraction/ExtractData.lean create mode 100644 data_extraction/ast.py create mode 100644 data_extraction/build_lean4_repo.py create mode 100644 data_extraction/cache.py create mode 100644 data_extraction/lean.py create mode 100644 data_extraction/trace.py create mode 100644 data_extraction/traced_data.py create mode 100644 interaction/Lean4Repl.lean create mode 100644 interaction/dojo.py create mode 100644 interaction/parse_goals.py create mode 100644 utils.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..8e8f7b3 --- /dev/null +++ b/__init__.py @@ -0,0 +1,36 @@ +import os +from loguru import logger + +from .data_extraction.trace import ( + trace, + get_traced_repo_path, + is_available_in_cache, +) + +from .data_extraction.traced_data import ( + 
TracedRepo, + TracedFile, + TracedTheorem, + TracedTactic, +) +from .interaction.dojo import ( + CommandState, + TacticState, + LeanError, + TimeoutError, + TacticResult, + DojoCrashError, + DojoHardTimeoutError, + DojoInitError, + Dojo, + ProofFinished, + ProofGivenUp, +) +from .interaction.parse_goals import Declaration, Goal, parse_goals +from .data_extraction.lean import get_latest_commit, LeanGitRepo, LeanFile, Theorem, Pos +from .constants import __version__ + +if os.geteuid() == 0: + logger.warning( + "Running LeanDojo as the root user may cause unexpected issues. Proceed with caution." + ) diff --git a/constants.py b/constants.py index a801d9b..01061a3 100644 --- a/constants.py +++ b/constants.py @@ -316,3 +316,105 @@ "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, ] +"""Constants controlling LeanDojo's behaviors. +Many of them are configurable via :ref:`environment-variables`. +""" + +import os +import re +import sys +import subprocess +import multiprocessing +from pathlib import Path +from typing import Tuple +from loguru import logger +from dotenv import load_dotenv + +load_dotenv() + +__version__ = "1.9.0" + +logger.remove() +if "VERBOSE" in os.environ or "DEBUG" in os.environ: + logger.add(sys.stderr, level="DEBUG") +else: + logger.add(sys.stderr, level="INFO") + +CACHE_DIR = ( + Path(os.environ["CACHE_DIR"]) + if "CACHE_DIR" in os.environ + else Path.home() / ".cache/lean_dojo" +).absolute() +"""Cache directory for storing traced repos (see :ref:`caching`). +""" + +REMOTE_CACHE_URL = "https://lean-dojo.s3.amazonaws.com" +"""URL of the remote cache (see :ref:`caching`).""" + +DISABLE_REMOTE_CACHE = "DISABLE_REMOTE_CACHE" in os.environ +"""Whether to disable remote caching (see :ref:`caching`) and build all repos locally. +""" + +TMP_DIR = Path(os.environ["TMP_DIR"]).absolute() if "TMP_DIR" in os.environ else None +"""Temporary directory used by LeanDojo for storing intermediate files +""" + +MAX_NUM_PROCS = 32 + +NUM_PROCS = int(os.getenv("NUM_PROCS", min(multiprocessing.cpu_count(), MAX_NUM_PROCS))) +"""Number of threads to use +""" + +NUM_WORKERS = NUM_PROCS - 1 + +LEAN4_URL = "https://github.com/leanprover/lean4" +"""The URL of the Lean 4 repo.""" + +LEAN4_PACKAGES_DIR = Path(".lake/packages") +"""The directory where Lean 4 dependencies are stored (since v4.3.0-rc2).""" + +LOAD_USED_PACKAGES_ONLY = "LOAD_USED_PACKAGES_ONLY" in os.environ +"""Only load depdendency files that are actually used by the target repo.""" + +LEAN4_BUILD_DIR = Path(".lake/build") + +TACTIC_CPU_LIMIT = int(os.getenv("TACTIC_CPU_LIMIT", 1)) +"""Number of CPUs for executing tactics when interacting with Lean (only useful when running within Docker). +""" + +TACTIC_MEMORY_LIMIT = os.getenv("TACTIC_MEMORY_LIMIT", "32g") +"""Maximum memory when interacting with Lean (only useful when running within Docker). +""" + +CONTAINER = os.getenv("CONTAINER", "native") +"""Container to use for running LeanDojo. Default to ``native`` but also support ``docker``. Using ``docker`` is recommended for Lean 3. +""" + +DOCKER_AVAILABLE = os.system("docker version 1>/dev/null 2>/dev/null") == 0 + +DOCKER_TAG = "yangky11/lean-dojo" + +if CONTAINER == "docker": + assert ( + DOCKER_AVAILABLE + ), "Failed to access Docker. Please make sure Docker is running and you have access. Alternatively, you can try to run without Docker by setting the `CONTAINER` environment variable to `native` (see https://leandojo.readthedocs.io/en/latest/user-guide.html#advanced-running-within-docker)." 
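The vendored __init__.py above re-exports the tracing entry points (LeanGitRepo, trace, is_available_in_cache, ...) that the rest of the pipeline calls. A brief usage sketch of that surface; the repository coordinates are copied from the paper list above, and tracing still requires the matching Lean toolchain to be available locally:

from lean_dojo import LeanGitRepo, trace, is_available_in_cache

repo = LeanGitRepo(
    "https://github.com/yangky11/miniF2F-lean4",
    "9e445f5435407f014b88b44a98436d50dd7abd00",
)
if is_available_in_cache(repo):
    print("already traced; the cached copy will be reused")
traced_repo = trace(repo)
print(len(traced_repo.traced_files), "files traced")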
+ os.system(f"docker pull {DOCKER_TAG} 1>/dev/null 2>/dev/null") + + +def check_git_version(min_version: Tuple[int, int, int]) -> Tuple[int, int, int]: + """Check the version of Git installed on the system.""" + res = subprocess.run("git --version", shell=True, capture_output=True, check=True) + output = res.stdout.decode() + error = res.stderr.decode() + assert error == "", error + m = re.match(r"git version (?P[0-9.]+)", output) + version = tuple(int(_) for _ in m["version"].split(".")) + + version_str = ".".join(str(_) for _ in version) + min_version_str = ".".join(str(_) for _ in min_version) + assert ( + version >= min_version + ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." + + +check_git_version((2, 25, 0)) diff --git a/container.py b/container.py new file mode 100644 index 0000000..af9f25a --- /dev/null +++ b/container.py @@ -0,0 +1,369 @@ +"""Containers provide runtime environment for running LeanDojo. +Currently, LeanDojo supports two types of containers: ``docker`` and ``native``. +The former is the default and recommended option, while the latter is experimental. +""" + +import os +import shlex +import signal +import shutil +import tempfile +import subprocess +from pathlib import Path +from loguru import logger +from dataclasses import dataclass +from abc import ABC, abstractmethod +from typing import List, Dict, Union, Tuple, Optional + +from .constants import CONTAINER, DOCKER_TAG +from .utils import execute, report_critical_failure, working_directory + + +@dataclass(frozen=True) +class Mount: + """A mount is a pair of source and destination paths.""" + + src: Path + dst: Path + + def __post_init__(self): + object.__setattr__(self, "src", Path(self.src)) + object.__setattr__(self, "dst", Path(self.dst)) + + def __iter__(self): + yield self.src + yield self.dst + + +def create_mounts(mts: Dict[Union[str, Path], Union[str, Path]]) -> List[Mount]: + """Create a list of mounts from a dictionary.""" + return [Mount(Path(k), Path(v)) for k, v in mts.items()] + + +class Container(ABC): + """Abstract base class for containers.""" + + @abstractmethod + def run( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + capture_output: bool, + cpu_limit: Optional[int], + memory_limit: Optional[str], + work_dir: Optional[str], + ) -> None: + """Run a command in the container. + + Args: + command (str): _description_ + mounts (List[Mount]): _description_ + envs (Dict[str, str]): _description_ + as_current_user (bool): _description_ + capture_output (bool): _description_ + cpu_limit (Optional[int]): _description_ + memory_limit (Optional[str]): _description_ + work_dir (Optional[str]): _description_ + """ + raise NotImplementedError + + @abstractmethod + def run_interactive( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + cpu_limit: Optional[int], + memory_limit: Optional[str], + work_dir: Optional[str], + ) -> subprocess.Popen: + """Run a command in the container interactively. 
+ + Args: + command (str): _description_ + mounts (List[Mount]): _description_ + envs (Dict[str, str]): _description_ + as_current_user (bool): _description_ + cpu_limit (Optional[int]): _description_ + memory_limit (Optional[str]): _description_ + work_dir (Optional[str]): _description_ + + Returns: + subprocess.Popen: _description_ + """ + raise NotImplementedError + + +def _copy_file_or_dir(src: Path, dst: Path, delete_existing: bool = False) -> None: + if src.is_file(): + shutil.copy(src, dst) + else: + assert src.is_dir() and not src.is_relative_to(dst) + + # Modified by motiwari so as not to delete existing repos while mounting + if not delete_existing: + if not dst.exists(): + shutil.copytree(src, dst, symlinks=True) + return + if dst.exists() and delete_existing: + shutil.rmtree(dst) + shutil.copytree(src, dst, symlinks=True) + + +class NativeContainer(Container): + """A container that runs commands natively.""" + + def _mount_files(self, mounts: List[Mount]) -> None: + cwd = Path.cwd() + import ipdb; ipdb.set_trace() + for src, dst in mounts: + if dst.is_absolute(): + dst = cwd / dst.relative_to(dst.root) + if src == cwd: + for path in src.glob("*"): + p = dst / path.relative_to(src) + p.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(path, p) + continue + assert not cwd.is_relative_to(src) + dst.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(src, dst) + + def _unmount_files(self, mounts: List[Mount]) -> None: + cwd = Path.cwd() + + for src, dst in mounts: + if dst.is_absolute(): + dst = cwd / dst.relative_to(dst.root) + + if dst.exists(): + if src.is_file(): + shutil.move(dst, src) + elif dst.is_relative_to(src): + for path in dst.glob("*"): + p = src / path.relative_to(dst) + p.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(path, p) + shutil.rmtree(dst) + else: + with report_critical_failure( + f"Failed to override the directory {src}" + ): + shutil.rmtree(src) + shutil.move(dst, src) + + for path in dst.parents: + if ( + path.exists() + and path.is_relative_to(cwd) + and len(list(path.glob("**/*"))) == 0 + ): + path.rmdir() + + def _build_native_command(self, command: str, envs: Dict[str, str]) -> str: + if len(envs) == 0: + return command + else: + return " ".join(f"{k}={v}" for k, v in envs.items()) + " " + command + + def run( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + capture_output: bool = False, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Union[Path, str, None] = None, + ) -> None: + assert as_current_user, "NativeContainer can only run as the current user." + assert memory_limit is None, "NativeContainer does not support memory limit." + assert cpu_limit is None, "NativeContainer does not support CPU limit." 
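NativeContainer above emulates bind mounts by copying files into the working directory and turns environment variables into a "K=V" prefix on the command line. A small usage sketch of create_mounts and _build_native_command, assuming this container module is importable; the paths and environment variable are illustrative only:

# Assuming: from container import create_mounts, NativeContainer
mounts = create_mounts({"/data/traced_repo": "/workspace/traced_repo"})
src, dst = mounts[0]  # Mount is iterable and yields (src, dst) as Path objects
print(src, "->", dst)

container = NativeContainer()
cmd = container._build_native_command("lake build", {"ELAN_HOME": "/tmp/elan"})
print(cmd)  # -> "ELAN_HOME=/tmp/elan lake build"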
+ + import ipdb; ipdb.set_trace() + self._mount_files(mounts) + + cmd = self._build_native_command(command, envs) + logger.debug(cmd) + + if work_dir is None: + work_dir = Path.cwd() + else: + work_dir = Path(work_dir) + if work_dir.is_absolute(): + work_dir = Path.cwd() / work_dir.relative_to(work_dir.root) + + with working_directory(work_dir): + execute(cmd, capture_output=capture_output) + + self._unmount_files(mounts) + + def run_interactive( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> subprocess.Popen: + assert as_current_user, "NativeContainer can only run as the current user." + + self._mount_files(mounts) + self.mounts = mounts + + cmd = self._build_native_command(command, envs) + logger.debug(cmd) + + if work_dir is None: + work_dir = Path.cwd() + else: + work_dir = Path(work_dir) + if work_dir.is_absolute(): + work_dir = Path.cwd() / work_dir.relative_to(work_dir.root) + + with working_directory(work_dir): + proc = subprocess.Popen( + shlex.split(cmd), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + encoding="utf-8", + bufsize=1, + ) + + return proc + + def cleanup(self) -> None: + self._unmount_files(self.mounts) + + +class DockerContainer(Container): + """A container that runs commands in a Docker container.""" + + def __init__(self, image: str) -> None: + self.image = image + self.cid_file = None + + def _build_docker_command( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + interactive: bool = False, + ) -> Tuple[str, Path]: + cid_file = Path(next(tempfile._get_candidate_names()) + ".cid") + cmd = f"docker run --cidfile {cid_file} --rm" + if as_current_user: + cmd += f" -u {os.getuid()}" + for src, dst in mounts: + cmd += f' --mount type=bind,src="{src}",target="{dst}"' + for k, v in envs.items(): + cmd += f" --env {k}={v}" + if cpu_limit: + cmd += f" --cpus {cpu_limit}" + if memory_limit: + cmd += f" --memory {memory_limit}" + if work_dir: + cmd += f" --workdir {work_dir}" + if interactive: + cmd += " -i" + cmd += f" {self.image} {command}" + return cmd, cid_file + + def run( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + capture_output: bool = False, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> None: + cmd, cid_file = self._build_docker_command( + command, + mounts, + envs, + as_current_user, + cpu_limit, + memory_limit, + work_dir, + interactive=False, + ) + logger.debug(cmd) + + def _exit_gracefully(signum, frame): + cid = open(cid_file).read().strip() + execute(f"docker stop -t 1 {cid}", capture_output=True) + raise RuntimeError(f"Failed to execute {cmd}") + + old_sigint = signal.signal(signal.SIGINT, _exit_gracefully) + old_sigterm = signal.signal(signal.SIGTERM, _exit_gracefully) + + execute(cmd, capture_output=capture_output) + + signal.signal(signal.SIGINT, old_sigint) + signal.signal(signal.SIGTERM, old_sigterm) + if cid_file.exists(): + cid_file.unlink() + + def run_interactive( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = False, + cpu_limit: Optional[int] = None, + 
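# Roughly, the command assembled by `_build_docker_command` above looks like the
# following for mounts={"/host/repo": "/workspace"}, envs={"GIT_LFS_SKIP_SMUDGE": "1"},
# cpu_limit=4, memory_limit="16g", work_dir="/workspace" (cid file name is random,
# all values illustrative; "-i" is appended only for interactive runs):
#
#   docker run --cidfile <tmp>.cid --rm -u <uid> \
#       --mount type=bind,src="/host/repo",target="/workspace" \
#       --env GIT_LFS_SKIP_SMUDGE=1 --cpus 4 --memory 16g --workdir /workspace \
#       -i <image> <command>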
memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> subprocess.Popen: + cmd, self.cid_file = self._build_docker_command( + command, + mounts, + envs, + as_current_user, + cpu_limit, + memory_limit, + work_dir, + interactive=True, + ) + logger.debug(cmd) + proc = subprocess.Popen( + shlex.split(cmd), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + encoding="utf-8", + bufsize=1, + ) + return proc + + def cleanup(self) -> None: + # Cannot use `self.proc.terminate()` to stop Docker since it may be running as root. + if self.cid_file is None or not self.cid_file.exists(): + return + cid = self.cid_file.open().read().strip() + os.system(f"docker stop -t 1 {cid} 1>/dev/null 2>/dev/null") + + +def get_container() -> Container: + if CONTAINER == "docker": + return DockerContainer(DOCKER_TAG) + else: + assert ( + CONTAINER == "native" + ), "Currently only `docker` and `native` are supported." + return NativeContainer() diff --git a/data_extraction/ExtractData.lean b/data_extraction/ExtractData.lean new file mode 100644 index 0000000..d161e79 --- /dev/null +++ b/data_extraction/ExtractData.lean @@ -0,0 +1,530 @@ +import Lean +import Lake + + +open Lean Elab System + +set_option maxHeartbeats 2000000 -- 10x the default maxHeartbeats. + + +instance : ToJson Substring where + toJson s := toJson s.toString + +instance : ToJson String.Pos where + toJson n := toJson n.1 + +deriving instance ToJson for SourceInfo +deriving instance ToJson for Syntax.Preresolved +deriving instance ToJson for Syntax +deriving instance ToJson for Position + + +namespace LeanDojo + + +/-- +The trace of a tactic. +-/ +structure TacticTrace where + stateBefore: String + stateAfter: String + pos: String.Pos -- Start position of the tactic. + endPos: String.Pos -- End position of the tactic. +deriving ToJson + + +/-- +The trace of a premise. +-/ +structure PremiseTrace where + fullName: String -- Fully-qualified name of the premise. + defPos: Option Position -- Where the premise is defined. + defEndPos: Option Position + modName: String -- In which module the premise is defined. + defPath: String -- The path of the file where the premise is defined. + pos: Option Position -- Where the premise is used. + endPos: Option Position +deriving ToJson + + +/-- +The trace of a Lean file. +-/ +structure Trace where + commandASTs : Array Syntax -- The ASTs of the commands in the file. + tactics: Array TacticTrace -- All tactics in the file. + premises: Array PremiseTrace -- All premises in the file. +deriving ToJson + + +abbrev TraceM := StateT Trace MetaM + + +namespace Pp + + +private def addLine (s : String) : String := + if s.isEmpty then s else s ++ "\n" + + +-- Similar to `Meta.ppGoal` but uses String instead of Format to make sure local declarations are separated by "\n". +private def ppGoal (mvarId : MVarId) : MetaM String := do + match (← getMCtx).findDecl? mvarId with + | none => return "unknown goal" + | some mvarDecl => + let indent := 2 + let lctx := mvarDecl.lctx + let lctx := lctx.sanitizeNames.run' { options := (← getOptions) } + Meta.withLCtx lctx mvarDecl.localInstances do + -- The followint two `let rec`s are being used to control the generated code size. + -- Then should be remove after we rewrite the compiler in Lean + let rec pushPending (ids : List Name) (type? : Option Expr) (s : String) : MetaM String := do + if ids.isEmpty then + return s + else + let s := addLine s + match type? 
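-- For orientation, the `ToJson` instances above serialize a traced file into
-- roughly this shape (field values are illustrative only):
--
--   { "commandASTs": [ ... ],
--     "tactics":  [ { "stateBefore": "⊢ 1 + 1 = 2", "stateAfter": "no goals",
--                     "pos": 104, "endPos": 118 } ],
--     "premises": [ { "fullName": "Nat.succ_le_succ", "modName": "...",
--                     "defPath": "...", "defPos": {...}, "defEndPos": {...},
--                     "pos": {...}, "endPos": {...} } ] }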
with + | none => return s + | some type => + let typeFmt ← Meta.ppExpr type + return (s ++ (Format.joinSep ids.reverse (format " ") ++ " :" ++ Format.nest indent (Format.line ++ typeFmt)).group).pretty + let rec ppVars (varNames : List Name) (prevType? : Option Expr) (s : String) (localDecl : LocalDecl) : MetaM (List Name × Option Expr × String) := do + match localDecl with + | .cdecl _ _ varName type _ _ => + let varName := varName.simpMacroScopes + let type ← instantiateMVars type + if prevType? == none || prevType? == some type then + return (varName :: varNames, some type, s) + else do + let s ← pushPending varNames prevType? s + return ([varName], some type, s) + | .ldecl _ _ varName type val _ _ => do + let varName := varName.simpMacroScopes + let s ← pushPending varNames prevType? s + let s := addLine s + let type ← instantiateMVars type + let typeFmt ← Meta.ppExpr type + let mut fmtElem := format varName ++ " : " ++ typeFmt + let val ← instantiateMVars val + let valFmt ← Meta.ppExpr val + fmtElem := fmtElem ++ " :=" ++ Format.nest indent (Format.line ++ valFmt) + let s := s ++ fmtElem.group.pretty + return ([], none, s) + let (varNames, type?, s) ← lctx.foldlM (init := ([], none, "")) fun (varNames, prevType?, s) (localDecl : LocalDecl) => + if localDecl.isAuxDecl || localDecl.isImplementationDetail then + -- Ignore auxiliary declarations and implementation details. + return (varNames, prevType?, s) + else + ppVars varNames prevType? s localDecl + let s ← pushPending varNames type? s + let goalTypeFmt ← Meta.ppExpr (← instantiateMVars mvarDecl.type) + let goalFmt := Meta.getGoalPrefix mvarDecl ++ Format.nest indent goalTypeFmt + let s := s ++ "\n" ++ goalFmt.pretty + match mvarDecl.userName with + | Name.anonymous => return s + | name => return "case " ++ name.eraseMacroScopes.toString ++ "\n" ++ s + + +def ppGoals (ctx : ContextInfo) (goals : List MVarId) : IO String := + if goals.isEmpty then + return "no goals" + else + let fmt := ctx.runMetaM {} (return Std.Format.prefixJoin "\n\n" (← goals.mapM (ppGoal ·))) + return (← fmt).pretty.trim + + +end Pp + + +namespace Path + +/-- +Return the path of `path` relative to `parent`. +-/ +def relativeTo (path parent : FilePath) : Option FilePath := + let rec componentsRelativeTo (pathComps parentComps : List String) : Option FilePath := + match pathComps, parentComps with + | _, [] => mkFilePath pathComps + | [], _ => none + | (h₁ :: t₁), (h₂ :: t₂) => + if h₁ == h₂ then + componentsRelativeTo t₁ t₂ + else + none + + componentsRelativeTo path.components parent.components + + +/-- +Return if the path `path` is relative to `parent`. +-/ +def isRelativeTo (path parent : FilePath) : Bool := + match relativeTo path parent with + | some _ => true + | none => false + + +/-- +Convert the path `path` to an absolute path. +-/ +def toAbsolute (path : FilePath) : IO FilePath := do + if path.isAbsolute then + pure path + else + let cwd ← IO.currentDir + pure $ cwd / path + + +private def trim (path : FilePath) : FilePath := + assert! 
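-- For example (illustrative paths):
--   relativeTo "Mathlib/Data/Nat/Basic.lean" "Mathlib"        = some "Data/Nat/Basic.lean"
--   relativeTo "Mathlib/Data/Nat/Basic.lean" ".lake/packages" = none
-- `isRelativeTo` just tests whether the result is `some _`, and `toAbsolute`
-- prefixes the current working directory onto relative paths.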
path.isRelative + mkFilePath $ path.components.filter (· != ".") + + +def packagesDir : FilePath := + if Lake.defaultPackagesDir == "packages" then + ".lake" / Lake.defaultPackagesDir + else + Lake.defaultPackagesDir + + +def buildDir : FilePath := + if Lake.defaultPackagesDir.fileName == "packages" then -- Lean >= v4.3.0-rc2 + ".lake/build" + else -- Lean < v4.3.0-rc2 + "build" + + +def libDir : FilePath := buildDir / "lib" + + +/-- +Convert the path of a *.lean file to its corresponding file (e.g., *.olean) in the "build" directory. +-/ +def toBuildDir (subDir : FilePath) (path : FilePath) (ext : String) : Option FilePath := + let path' := (trim path).withExtension ext + match relativeTo path' $ packagesDir / "lean4/src" with + | some p => + match relativeTo p "lean/lake" with + | some p' => packagesDir / "lean4/lib/lean" / p' + | none => packagesDir / "lean4/lib" / p + | none => match relativeTo path' packagesDir with + | some p => + match p.components with + | [] => none + | hd :: tl => packagesDir / hd / buildDir / subDir / (mkFilePath tl) + | none => buildDir / subDir / path' + + +/-- +The reverse of `toBuildDir`. +-/ +-- proofwidgets/build/lib/ProofWidgets/Compat.lean +-- proofwidgets/.lake/build/lib +def toSrcDir! (path : FilePath) (ext : String) : FilePath := + let path' := (trim path).withExtension ext + match relativeTo path' $ packagesDir / "lean4/lib" with + | some p => -- E.g., `.lake/packages/lean4/lib/lean/Init/Prelude.olean` -> `.lake/packages/lean4/src/lean/Init/Prelude.lean` + packagesDir / "lean4/src" / p + | none => + match relativeTo path' packagesDir with + | some p => -- E.g., `.lake/packages/aesop/.lake/build/lib/Aesop.olean`-> `.lake/packages/aesop/Aesop.lean` + let pkgName := p.components.head! + let sep := "build/lib/" + packagesDir / pkgName / (p.toString.splitOn sep |>.tail!.head!) + | none => + -- E.g., `.lake/build/lib/Mathlib/LinearAlgebra/Basic.olean` -> `Mathlib/LinearAlgebra/Basic.lean` + relativeTo path' libDir |>.get! + + +/-- +Create all parent directories of `p` if they don't exist. +-/ +def makeParentDirs (p : FilePath) : IO Unit := do + let some parent := p.parent | throw $ IO.userError s!"Unable to get the parent of {p}" + IO.FS.createDirAll parent + + +/-- +Return the *.lean file corresponding to a module name. +-/ +def findLean (mod : Name) : IO FilePath := do + let modStr := mod.toString + if modStr.startsWith "«lake-packages»." then + return FilePath.mk (modStr.replace "«lake-packages»" "lake-packages" |>.replace "." "/") |>.withExtension "lean" + if modStr.startsWith "«.lake»." then + return FilePath.mk (modStr.replace "«.lake»" ".lake" |>.replace "." "/") |>.withExtension "lean" + let olean ← findOLean mod + -- Remove a "build/lib/" substring from the path. + let lean := olean.toString.replace ".lake/build/lib/" "" + |>.replace "build/lib/" "" |>.replace "lib/lean/Lake/" "lib/lean/lake/Lake/" + let mut path := FilePath.mk lean |>.withExtension "lean" + let leanLib ← getLibDir (← getBuildDir) + if let some p := relativeTo path leanLib then + path := packagesDir / "lean4/src/lean" / p + assert! ← path.pathExists + return path + +end Path + + +namespace Traversal + + +/-- +Extract tactic information from `TacticInfo` in `InfoTree`. +-/ +private def visitTacticInfo (ctx : ContextInfo) (ti : TacticInfo) (parent : InfoTree) : TraceM Unit := do + match ti.stx.getKind with + | ``Lean.Parser.Term.byTactic => + match ti.stx with + | .node _ _ #[.atom _ "by", .node _ ``Lean.Parser.Tactic.tacticSeq _] => pure () + | _ => assert! 
false + + | ``Lean.Parser.Tactic.tacticSeq => + match ti.stx with + | .node _ _ #[.node _ ``Lean.Parser.Tactic.tacticSeq1Indented _] => pure () + | .node _ _ #[.node _ ``Lean.Parser.Tactic.tacticSeqBracketed _] => pure () + | _ => assert! false + + | _ => pure () + + match parent with + | .node (Info.ofTacticInfo i) _ => + match i.stx.getKind with + | ``Lean.Parser.Tactic.tacticSeq1Indented | ``Lean.Parser.Tactic.tacticSeqBracketed | ``Lean.Parser.Tactic.rewriteSeq => + let ctxBefore := { ctx with mctx := ti.mctxBefore } + let ctxAfter := { ctx with mctx := ti.mctxAfter } + let stateBefore ← Pp.ppGoals ctxBefore ti.goalsBefore + let stateAfter ← Pp.ppGoals ctxAfter ti.goalsAfter + if stateBefore == "no goals" || stateBefore == stateAfter then + pure () + else + let some posBefore := ti.stx.getPos? true | pure () + let some posAfter := ti.stx.getTailPos? true | pure () + match ti.stx with + | .node _ _ _ => + modify fun trace => { + trace with tactics := trace.tactics.push { + stateBefore := stateBefore, + stateAfter := stateAfter, + pos := posBefore, + endPos := posAfter, + } + } + | _ => pure () + | _ => pure () + | _ => pure () + + +/-- +Extract premise information from `TermInfo` in `InfoTree`. +-/ +private def visitTermInfo (ti : TermInfo) (env : Environment) : TraceM Unit := do + let some fullName := ti.expr.constName? | return () + let fileMap ← getFileMap + + let posBefore := match ti.toElabInfo.stx.getPos? with + | some posInfo => fileMap.toPosition posInfo + | none => none + + let posAfter := match ti.toElabInfo.stx.getTailPos? with + | some posInfo => fileMap.toPosition posInfo + | none => none + + let decRanges ← withEnv env $ findDeclarationRanges? fullName + let defPos := decRanges >>= fun (decR : DeclarationRanges) => decR.selectionRange.pos + let defEndPos := decRanges >>= fun (decR : DeclarationRanges) => decR.selectionRange.endPos + + let modName := + if let some modIdx := env.const2ModIdx.find? fullName then + env.header.moduleNames[modIdx.toNat]! + else + env.header.mainModule + + let mut defPath := toString $ ← Path.findLean modName + if defPath.startsWith "./" then + defPath := defPath.drop 2 + if defPath.startsWith "/lake/" then + defPath := ".lake/" ++ (defPath.drop 6) + + if defPos != posBefore ∧ defEndPos != posAfter then -- Don't include defintions as premises. + modify fun trace => { + trace with premises := trace.premises.push { + fullName := toString fullName, + defPos := defPos, + defEndPos := defEndPos, + defPath := defPath, + modName := toString modName, + pos := posBefore, + endPos := posAfter, + } + } + + +private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do + match i with + | .ofTacticInfo ti => visitTacticInfo ctx ti parent + | .ofTermInfo ti => visitTermInfo ti env + | _ => pure () + + +private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree) +(parent : InfoTree) (env : Environment) : TraceM Unit := do + match tree with + | .context ctx' t => + match ctx'.mergeIntoOuter? ctx with + | some ctx' => traverseTree ctx' t tree env + | none => panic! "fail to synthesis contextInfo when traversing infoTree" + | .node i children => + visitInfo ctx i parent env + for x in children do + traverseTree ctx x tree env + | _ => pure () + + +private def traverseTopLevelTree (tree : InfoTree) (env : Environment) : TraceM Unit := do + match tree with + | .context ctx t => + match ctx.mergeIntoOuter? none with + | some ctx => traverseTree ctx t tree env + | none => panic! 
"fail to synthesis contextInfo for top-level infoTree" + | _ => pure () + + +/-- +Process an array of `InfoTree` (one for each top-level command in the file). +-/ +def traverseForest (trees : Array InfoTree) (env : Environment) : TraceM Trace := do + for t in trees do + traverseTopLevelTree t env + get + + +end Traversal + + +open Traversal + + +def getImports (header: Syntax) : IO String := do + -- Similar to `lean --deps` in Lean 3. + let mut s := "" + + for dep in headerToImports header do + let oleanPath ← findOLean dep.module + if oleanPath.isRelative then + let leanPath := Path.toSrcDir! oleanPath "lean" + assert! ← leanPath.pathExists + s := s ++ "\n" ++ leanPath.toString + else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then + let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString + let mut found := false + for c in (oleanPath.withExtension "lean").components do + if c == "lib" then + found := true + p := p ++ "src" + continue + if found then + p := p ++ FilePath.pathSeparator.toString ++ c + p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake" + assert! ← FilePath.mk p |>.pathExists + s := s ++ "\n" ++ p + + return s.trim + + +/-- +Trace a *.lean file. +-/ +unsafe def processFile (path : FilePath) : IO Unit := do + println! path + let input ← IO.FS.readFile path + enableInitializersExecution + let inputCtx := Parser.mkInputContext input path.toString + let (header, parserState, messages) ← Parser.parseHeader inputCtx + let (env, messages) ← processHeader header {} messages inputCtx + + if messages.hasErrors then + for msg in messages.toList do + if msg.severity == .error then + println! "ERROR: {← msg.toString}" + throw $ IO.userError "Errors during import; aborting" + + let env := env.setMainModule (← moduleNameOfFileName path none) + let commandState := { Command.mkState env messages {} with infoState.enabled := true } + let s ← IO.processCommands inputCtx parserState commandState + let env' := s.commandState.env + let commands := s.commands.pop -- Remove EOI command. + let trees := s.commandState.infoState.trees.toArray + + let traceM := (traverseForest trees env').run' ⟨#[header] ++ commands, #[], #[]⟩ + let (trace, _) ← traceM.run'.toIO {fileName := s!"{path}", fileMap := FileMap.ofString input} {env := env} + + let cwd ← IO.currentDir + assert! cwd.fileName != "lean4" + + let some relativePath := Path.relativeTo path cwd | throw $ IO.userError s!"Invalid path: {path}" + let json_path := Path.toBuildDir "ir" relativePath "ast.json" |>.get! + Path.makeParentDirs json_path + IO.FS.writeFile json_path (toJson trace).pretty + + let dep_path := Path.toBuildDir "ir" relativePath "dep_paths" |>.get! + Path.makeParentDirs dep_path + IO.FS.writeFile dep_path (← getImports header) + + +end LeanDojo + + +open LeanDojo + +/-- +Whether a *.lean file should be traced. +-/ +def shouldProcess (path : FilePath) (noDeps : Bool) : IO Bool := do + if (← path.isDir) ∨ path.extension != "lean" then + return false + + let cwd ← IO.currentDir + let some relativePath := Path.relativeTo path cwd | + throw $ IO.userError s!"Invalid path: {path}" + + if noDeps ∧ Path.isRelativeTo relativePath Path.packagesDir then + return false + + let some oleanPath := Path.toBuildDir "lib" relativePath "olean" | + throw $ IO.userError s!"Invalid path: {path}" + return ← oleanPath.pathExists + + +/-- +Trace all *.lean files in the current directory whose corresponding *.olean file exists. 
+-/ +def processAllFiles (noDeps : Bool) : IO Unit := do + let cwd ← IO.currentDir + assert! cwd.fileName != "lean4" + println! "Extracting data at {cwd}" + + let mut tasks := #[] + for path in ← System.FilePath.walkDir cwd do + if ← shouldProcess path noDeps then + println! path + let t ← IO.asTask $ IO.Process.run + {cmd := "lake", args := #["env", "lean", "--run", "ExtractData.lean", path.toString]} + tasks := tasks.push (t, path) + + for (t, path) in tasks do + match ← IO.wait t with + | Except.error _ => + println! s!"WARNING: Failed to process {path}" + pure () + -- throw e + | Except.ok _ => pure () + + +unsafe def main (args : List String) : IO Unit := do + match args with + | ["noDeps"] => processAllFiles (noDeps := true) + | [path] => processFile (← Path.toAbsolute ⟨path⟩) + | [] => processAllFiles (noDeps := false) + | _ => throw $ IO.userError "Invalid arguments" diff --git a/data_extraction/ast.py b/data_extraction/ast.py new file mode 100644 index 0000000..ff716ac --- /dev/null +++ b/data_extraction/ast.py @@ -0,0 +1,1576 @@ +from lxml import etree +from pathlib import Path +from dataclasses import dataclass, field +from xml.sax.saxutils import escape, unescape +from typing import List, Dict, Any, Optional, Callable, Tuple, Generator + +from ..utils import ( + camel_case, + is_optional_type, + remove_optional_type, + parse_int_list, + parse_str_list, +) +from .lean import Pos, LeanFile + + +@dataclass(frozen=True) +class Node: + lean_file: LeanFile + start: Optional[Pos] + end: Optional[Pos] + children: List["Node"] = field(repr=False) + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "Node": + subcls = cls._kind_to_node_type(node_data["kind"]) + return subcls.from_data(node_data, lean_file) + + @classmethod + def _kind_to_node_type(cls, kind: str) -> type: + prefix = "Lean.Parser." 
+ if kind.startswith(prefix): + kind = kind[len(prefix) :] + cls_name = camel_case(kind.replace(".", "_")) + "Node" + gbs = globals() + if cls_name in gbs: + return gbs[cls_name] # type: ignore + else: + # logger.warning(kind) + return OtherNode + + @classmethod + def kind(cls: type) -> str: + return cls.__name__[:-4].lower() + + def traverse_preorder( + self, + callback: Callable[["Node", List["Node"]], Any], + node_cls: Optional[type], + parents: List["Node"] = [], + ) -> None: + if node_cls is None or isinstance(self, node_cls): + if callback(self, parents): + return + for child in self.children: + child.traverse_preorder(callback, node_cls, parents + [self]) + + def traverse_postorder( + self, + callback: Callable[["Node", List[Any]], Any], + ) -> Any: + return callback( + self, [child.traverse_postorder(callback) for child in self.children] + ) + + def to_xml(self, parent: etree.Element) -> None: + tree = etree.SubElement(parent, self.__class__.__name__) + + for k in self.__dataclass_fields__: + if k in ("lean_file", "children"): + continue + v = getattr(self, k) + if v is not None: + v = escape(str(v), entities={'"': """}) + tree.set(k, v) + + for child in self.children: + child.to_xml(tree) + + @classmethod + def from_xml(cls, tree: etree.Element, lean_file: LeanFile) -> "Node": + subcls = globals()[tree.tag] + start = Pos.from_str(tree.attrib["start"]) if "start" in tree.attrib else None + end = Pos.from_str(tree.attrib["end"]) if "end" in tree.attrib else None + children = [Node.from_xml(subtree, lean_file) for subtree in tree] + kwargs = {} + + for field in subcls.__dataclass_fields__.values(): + if field.name in ("lean_file", "start", "end", "children"): + continue + v = tree.attrib.get(field.name, None) + if v is None: + kwargs[field.name] = None + continue + + assert isinstance(v, str) + v = unescape(v, entities={""": '"'}) + tp = ( + remove_optional_type(field.type) + if is_optional_type(field.type) + else field.type + ) + if tp is Pos: + kwargs[field.name] = Pos.from_str(v) + elif tp is Path: + kwargs[field.name] = Path(v) + elif tp is List[int]: + kwargs[field.name] = parse_int_list(v) + elif tp is List[str]: + kwargs[field.name] = parse_str_list(v) + else: + kwargs[field.name] = v # type: ignore + + return subcls(lean_file, start, end, children, **kwargs) # type: ignore + + def get_closure(self) -> Tuple[Pos, Pos]: + return self.start, self.end + + +def _parse_pos(info: Dict[str, Any], lean_file: LeanFile) -> Pos: + if "synthetic" in info and not info["synthetic"]["canonical"]: + return None + + if ( + "original" in info + ): # | original (leading : Substring) (pos : String.Pos) (trailing : Substring) (endPos : String.Pos) + start, end = info["original"]["pos"], info["original"]["endPos"] + else: + assert ( + "synthetic" in info + ) # | synthetic (pos : String.Pos) (endPos : String.Pos) (canonical := false) + start, end = info["synthetic"]["pos"], info["synthetic"]["endPos"] + + start = lean_file.convert_pos(start) + end = lean_file.convert_pos(end) + + return start, end + + +@dataclass(frozen=True) +class AtomNode(Node): + leading: str + trailing: str + val: str + + @classmethod + def from_data( + cls, atom_data: Dict[str, Any], lean_file: LeanFile + ) -> Optional["AtomNode"]: + info = atom_data["info"] + start, end = _parse_pos(info, lean_file) + + if "original" in info: + leading = info["original"]["leading"] + trailing = info["original"]["trailing"] + else: + assert "synthetic" in info + leading = info["synthetic"]["leading"] + trailing = info["synthetic"]["trailing"] 
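# For example, assuming `camel_case` capitalizes each "_"-separated piece,
# `_kind_to_node_type` above maps syntax kinds to the dataclasses defined below:
#
#   "Lean.Parser.Command.declaration" -> "Command_declaration" -> CommandDeclarationNode
#   "Lean.Parser.Term.byTactic"       -> "Term_byTactic"       -> TermBytacticNode
#   unknown kinds fall back to OtherNode (which stores the raw kind string).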
+ + return cls(lean_file, start, end, [], leading, trailing, atom_data["val"]) + + +@dataclass(frozen=True) +class IdentNode(Node): + leading: str + trailing: str + raw_val: str + val: str + + full_name: Optional[str] = None + mod_name: Optional[str] = None + def_path: Optional[str] = None + def_start: Optional[Pos] = None + def_end: Optional[Pos] = None + + @classmethod + def from_data( + cls, ident_data: Dict[str, Any], lean_file: LeanFile + ) -> Optional["IdentNode"]: + info = ident_data["info"] + start, end = _parse_pos(info, lean_file) + assert ident_data["preresolved"] == [] + + if "original" in info: + leading = info["original"]["leading"] + trailing = info["original"]["trailing"] + else: + assert "synthetic" in info + leading = info["synthetic"]["leading"] + trailing = info["synthetic"]["trailing"] + + return cls( + lean_file, + start, + end, + [], + leading, + trailing, + ident_data["rawVal"], + ident_data["val"], + ) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.full_name, str) + + +def is_leaf(node: Node) -> bool: + return isinstance(node, AtomNode) or isinstance(node, IdentNode) + + +@dataclass(frozen=True) +class FileNode(Node): + @classmethod + def from_data(cls, data: Dict[str, Any], lean_file: LeanFile) -> "FileNode": + children = [] + + def _get_closure(node: Node, child_spans: List[Tuple[Pos, Pos]]): + if len(child_spans) == 0: + return node.start, node.end + + child_starts = [s for s, _ in child_spans if s is not None] + if len(child_starts) == 0: + start = None + else: + start = min(child_starts) + + child_ends = [e for _, e in child_spans if e is not None] + if len(child_ends) == 0: + end = None + else: + end = max(child_ends) + + if node.start is None: + object.__setattr__(node, "start", start) + else: + assert node.start == start + + if node.end is None: + object.__setattr__(node, "end", end) + else: + assert node.end == end + + return start, end + + for i, d in enumerate(data["commandASTs"]): + node_data = d["node"] + if i == 0: + assert node_data["kind"] == "Lean.Parser.Module.header" + node = Node.from_data(node_data, lean_file) + node.traverse_postorder(_get_closure) + children.append(node) + + return cls(lean_file, lean_file.start_pos, lean_file.end_pos, children) + + +def _parse_children(node_data: Dict[str, Any], lean_file: LeanFile) -> List[Node]: + children = [] + + for d in node_data["args"]: + if ( + "node" in d + ): # | node (info : SourceInfo) (kind : SyntaxNodeKind) (args : Array Syntax) : Syntax + node = Node.from_data(d["node"], lean_file) + elif "atom" in d: # | atom (info : SourceInfo) (val : String) : Syntax + node = AtomNode.from_data(d["atom"], lean_file) + elif ( + "ident" in d + ): # | ident (info : SourceInfo) (rawVal : Substring) (val : Name) (preresolved : List Syntax.Preresolved) : Syntax + node = IdentNode.from_data(d["ident"], lean_file) + else: + raise ValueError(d) + + if node is not None: + children.append(node) + + return children + + +@dataclass(frozen=True) +class TermAttrkindNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermAttrkindNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermAttrkindAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermAttrkindAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + 
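# A small usage sketch of the traversal API above (variable names illustrative):
#
#   idents = []
#   def _collect(node, parents):
#       idents.append(node.val)   # node is an IdentNode here
#       return False              # False = keep descending into children
#   file_node.traverse_preorder(_collect, node_cls=IdentNode)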
children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class IdentAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "IdentAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_ident(self) -> str: + return "".join(gc.val for gc in self.children if is_leaf(gc)) + + +@dataclass(frozen=True) +class LeanElabCommandCommandIrreducibleDefNode(Node): + name: Optional[str] + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanElabCommandCommandIrreducibleDefNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], CommandDeclmodifiersAntiquotNode): + name = None + else: + assert isinstance(children[0], CommandDeclmodifiersNode) + assert ( + isinstance(children[1], AtomNode) + and children[1].val == "irreducible_def" + ) + declid_node = children[2] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class GroupNode(Node): + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "GroupNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class MathlibTacticLemmaNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. + ) + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "MathlibTacticLemmaNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], GroupNode) + assert ( + isinstance(children[1].children[0], AtomNode) + and children[1].children[0].val == "lemma" + ) + declid_node = children[1].children[1] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[1].children[3] + if isinstance( + decl_val_node, (CommandDeclvalsimpleNode, CommandWherestructinstNode) + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class LemmaNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. 
+ ) + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "LemmaNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], GroupNode) + assert ( + isinstance(children[1].children[0], AtomNode) + and children[1].children[0].val == "lemma" + ) + declid_node = children[1].children[1] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[1].children[3] + if isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandWherestructinstNode, + ), + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class CommandDeclarationNode(Node): + name: str + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclarationNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], CommandDeclmodifiersAntiquotNode): + name = None + else: + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance( + children[1], + ( + CommandDefNode, + CommandDefinitionNode, + CommandTheoremNode, + CommandInductiveNode, + CommandClassinductiveNode, + CommandStructureNode, + CommandInstanceNode, + CommandAbbrevNode, + CommandOpaqueNode, + CommandAxiomNode, + CommandExampleNode, + ), + ) + name = children[1].name + + if children[0].is_private(): + for child in children: + if isinstance(child, CommandTheoremNode): + object.__setattr__(child, "_is_private_decl", True) + + return cls(lean_file, start, end, children, name) + + @property + def is_theorem(self) -> bool: + return isinstance(self.children[1], CommandTheoremNode) + + def get_theorem_node(self) -> "CommandTheoremNode": + assert self.is_theorem + return self.children[1] + + @property + def is_example(self) -> bool: + return isinstance(self.children[1], CommandExampleNode) + + +@dataclass(frozen=True) +class CommandDeclmodifiersAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclmodifiersAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclmodifiersNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclmodifiersNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def is_private(self) -> bool: + result = False + + def _callback(node: CommandPrivateNode, _) -> bool: + nonlocal result + result = True + return True + + self.traverse_preorder(_callback, CommandPrivateNode) + return result + + +@dataclass(frozen=True) +class CommandPrivateNode(Node): 
+ @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandPrivateNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandOpenNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpenNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandOpenonlyNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpenonlyNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class NullNode(Node): + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "NullNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandStructuretkNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandStructuretkNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "structure" + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandClasstkNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandClasstkNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "class" + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandStructureNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandStructureNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], (CommandStructuretkNode, CommandClasstkNode)) + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandInductiveNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandInductiveNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "inductive" + + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) 
+class CommandClassinductiveNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandClassinductiveNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert ( + isinstance(children[0].children[0], AtomNode) + and children[0].children[0].val == "class" + ) + assert ( + isinstance(children[0].children[1], AtomNode) + and children[0].children[1].val == "inductive" + ) + + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class TermHoleNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermHoleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + AtomNode, + TokenAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class LeanBinderidentNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanBinderidentNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + TermHoleNode, + IdentNode, + IdentAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + def get_ident(self) -> Optional[str]: + if isinstance(self.children[0], TermHoleNode): + return None + else: + assert isinstance(self.children[0], IdentNode) + return self.children[0].val + + +@dataclass(frozen=True) +class LeanBinderidentAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanBinderidentAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_ident(self) -> Optional[str]: + return None + + +@dataclass(frozen=True) +class StdTacticAliasAliasNode(Node): + name: str + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "StdTacticAliasAliasNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], AtomNode) and children[1].val == "alias" + if isinstance(children[2], IdentAntiquotNode): + name = None + else: + ident_node = children[2] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class StdTacticAliasAliaslrNode(Node): + name: List[str] + full_name: Optional[List[str]] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "StdTacticAliasAliaslrNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], AtomNode) and children[1].val == "alias" + assert 
isinstance(children[2], AtomNode) and children[2].val == "⟨" + assert isinstance(children[4], AtomNode) and children[4].val == "," + assert isinstance(children[6], AtomNode) and children[6].val == "⟩" + + name = [] + assert isinstance( + children[3], (LeanBinderidentNode, LeanBinderidentAntiquotNode) + ) + name.append(children[3].get_ident()) + assert isinstance( + children[5], (LeanBinderidentNode, LeanBinderidentAntiquotNode) + ) + name.append(children[5].get_ident()) + name = [n for n in name if n is not None] + + return cls(lean_file, start, end, children, name) + + @property + def is_mutual(self) -> bool: + return True + + +@dataclass(frozen=True) +class CommandAbbrevNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandAbbrevNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "abbrev" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandOpaqueNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpaqueNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "opaque" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandAxiomNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandAxiomNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "axiom" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandExampleNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandExampleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "example" + name = None + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandInstanceNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandInstanceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + if isinstance(children[0], TermAttrkindAntiquotNode): + name = None + 
else: + assert isinstance(children[0], TermAttrkindNode) + assert isinstance(children[1], AtomNode) and children[1].val == "instance" + if children[3].children != []: + declid_node = children[3].children[0] + if isinstance(declid_node, CommandDeclidNode): + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + else: + name = None + else: + name = None + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDefNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDefNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], TokenAntiquotNode) or isinstance( + children[1], CommandDeclidAntiquotNode + ): + name = None + else: + assert isinstance(children[0], AtomNode) and children[0].val == "def" + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDefinitionNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDefinitionNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], TokenAntiquotNode) or isinstance( + children[1], CommandDeclidAntiquotNode + ): + name = None + else: + assert isinstance(children[0], AtomNode) and children[0].val == "def" + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDeclidAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclidAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclidNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclidNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclvalsimpleNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclvalsimpleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TokenAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TokenAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class 
CommandDeclvaleqnsNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclvaleqnsNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandWherestructinstNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandWherestructinstNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclsigNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclsigNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermExplicitbinderNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermExplicitbinderNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermTypespecNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermTypespecNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandTheoremNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. 
+ ) + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandTheoremNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "theorem" + + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + ident_node = declid_node.children[0] + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + if not isinstance(children[1], CommandDeclidAntiquotNode): + assert isinstance(children[2], CommandDeclsigNode) + decl_val_node = children[3] + assert isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandDeclvaleqnsNode, + CommandWherestructinstNode, + ), + ) + + if isinstance(decl_val_node, CommandDeclvalsimpleNode): + assert ( + isinstance(decl_val_node.children[0], AtomNode) + and decl_val_node.children[0].val == ":=" + ) + elif isinstance(decl_val_node, CommandWherestructinstNode): + assert ( + isinstance(decl_val_node.children[0], AtomNode) + and decl_val_node.children[0].val == "where" + ) + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[3] + if isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandWherestructinstNode, + ), + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class TermBytacticNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermBytacticNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TacticTacticseq1IndentedAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseq1IndentedAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + return + + +@dataclass(frozen=True) +class TacticTacticseqNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseqNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + TacticTacticseq1IndentedAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + yield from self.children[0].get_tactic_nodes(atomic_only) + + +@dataclass(frozen=True) +class TacticTacticseq1IndentedNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseq1IndentedNode": + assert node_data["info"] == "none" + start, end = None, None + 
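# A sketch tying the declaration/theorem/tactic nodes together (it assumes, as in
# the Lean grammar, that a TermBytacticNode's second child is its tactic sequence):
#
#   if decl.is_theorem:                       # decl: a CommandDeclarationNode
#       thm = decl.get_theorem_node()         # CommandTheoremNode
#       if thm.has_tactic_proof():
#           seq = thm.get_proof_node().children[1]   # TacticTacticseqNode (assumed index)
#           for tac in seq.get_tactic_nodes(atomic_only=True):
#               print(type(tac).__name__, tac.start, tac.end)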
children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance(children[0], NullNode) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + for i, tac_node in enumerate(self.children[0].children): + if i % 2 == 0: + if not atomic_only or not contains_tactic(tac_node): + yield tac_node + else: + assert isinstance(tac_node, NullNode) or isinstance(tac_node, AtomNode) + + +@dataclass(frozen=True) +class TacticTacticseqbracketedNode(Node): + state_before: Optional[str] = None + state_after: Optional[str] = None + tactic: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseqbracketedNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 3 + return cls(lean_file, start, end, children) + + @property + def tactic_nodes(self) -> List[Node]: + children = self.children + if not isinstance(children[0], AtomNode) or children[0].val != "{": + return [] + + assert isinstance(children[1], NullNode) + assert isinstance(children[2], AtomNode) and children[2].val == "}" + nodes = [] + for i, tac_node in enumerate(children[1].children): + if i % 2 == 0: + nodes.append(tac_node) + else: + assert isinstance(tac_node, NullNode) or isinstance(tac_node, AtomNode) + return nodes + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + children = self.children + if isinstance(children[0], AtomNode) and children[0].val == "{": + assert isinstance(children[1], NullNode) + assert isinstance(children[2], AtomNode) and children[2].val == "}" + for i, tac_node in enumerate(children[1].children): + if i % 2 == 0: + if not atomic_only or not contains_tactic(tac_node): + yield tac_node + else: + assert isinstance(tac_node, NullNode) or isinstance( + tac_node, AtomNode + ) + + +def contains_tactic(node: Node) -> bool: + result = False + + def _callback(x, _) -> bool: + if x is not node and isinstance( + x, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + nonlocal result + result = True + return True + + node.traverse_preorder(_callback, node_cls=None) + return result + + +@dataclass(frozen=True) +class ModuleHeaderNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModuleHeaderNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModulePreludeNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModulePreludeNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModulePreludeNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModulePreludeNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModuleImportNode(Node): + module: Optional[str] + path: Optional[Path] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> 
"ModuleImportNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "import" + if isinstance(children[2], IdentNode): + module = children[2].val + else: + module = None + + return cls(lean_file, start, end, children, module) + + +@dataclass(frozen=True) +class CommandModuledocNode(Node): + comment: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandModuledocNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 2 and all(isinstance(_, AtomNode) for _ in children) + assert children[0].val == "/-!" + comment = children[1].val + return cls(lean_file, start, end, children, comment) + + +@dataclass(frozen=True) +class CommandDoccommentNode(Node): + comment: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDoccommentNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 2 and all(isinstance(_, AtomNode) for _ in children) + assert children[0].val == "/--" + comment = children[1].val + return cls(lean_file, start, end, children, comment) + + +@dataclass(frozen=True) +class CommandNamespaceNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNamespaceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "namespace" + if isinstance(children[1], IdentNode): + name = children[1].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandSectionNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNamespaceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "section" + assert isinstance(children[1], NullNode) + + if len(children[1].children) == 1 and isinstance( + children[1].children[0], IdentNode + ): + name = children[1].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandNoncomputablesectionNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNoncomputablesectionNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 3 + assert isinstance(children[0], AtomNode) and children[0].val == "noncomputable" + assert isinstance(children[1], AtomNode) and children[1].val == "section" + assert isinstance(children[2], NullNode) + + if len(children[2].children) == 1 and isinstance( + children[2].children[0], IdentNode + ): + name = children[2].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandEndNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: 
LeanFile + ) -> "CommandEndNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "end" + assert isinstance(children[1], NullNode) + + if len(children[1].children) == 1 and isinstance( + children[1].children[0], IdentNode + ): + name = children[1].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class OtherNode(Node): + kind: str # type: ignore + state_before: Optional[str] = None + state_after: Optional[str] = None + tactic: Optional[str] = None + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "OtherNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children, node_data["kind"]) + + +def is_potential_premise_lean4(node: Node) -> bool: + """Check if ``node`` is a theorem/definition that can be used as a premise.""" + if (isinstance(node, CommandDeclarationNode) and not node.is_example) or isinstance( + node, + ( + LemmaNode, + MathlibTacticLemmaNode, + LeanElabCommandCommandIrreducibleDefNode, + StdTacticAliasAliasNode, + StdTacticAliasAliaslrNode, + ), + ): + return node.name is not None + else: + return False + + +def is_mutual_lean4(node: Node) -> bool: + return ( + isinstance(node, (IdentNode, CommandTheoremNode, StdTacticAliasAliaslrNode)) + and node.is_mutual + ) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py new file mode 100644 index 0000000..a15dd61 --- /dev/null +++ b/data_extraction/build_lean4_repo.py @@ -0,0 +1,214 @@ +"""Build Lean 4 projects in Docker. + +Only this file runs in Docker. So it must be self-contained. +""" + +import os +import re +import shutil +import argparse +import itertools +import subprocess +from tqdm import tqdm +from loguru import logger +from time import sleep, monotonic +from pathlib import Path, PurePath +from multiprocessing import Process +from contextlib import contextmanager +from typing import Union, List, Optional, Generator + + +def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optional[str]: + """Run a shell command. + + Args: + cmd (Union[str, List[str]]): A command or a list of commands. + """ + if isinstance(cmd, list): + cmd = " && ".join(cmd) + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + if capture_output: + return res.stdout.decode() + else: + return None + + +def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: + """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. + + Args: + dir (Path): The directory containing Lean files. 
+ """ + dir = Path(dir) + + for p in dir.glob("**/*.lean"): + with p.with_suffix(".dep_paths").open("wt") as oup: + for line in run_cmd( + f"{lean_bin} --deps {p}", capture_output=True + ).splitlines(): + olean_path = PurePath(line.strip()) + assert olean_path.suffix == ".olean" + lean_path = olean_path.relative_to(root).with_suffix(".lean") + oup.write(str(lean_path) + "\n") + + +def remove_files(dir: Path, suffix: str) -> None: + """Remove all files in ``dir`` that end with ``suffix``.""" + for p in Path(dir).glob(f"**/*{suffix}"): + p.unlink() + + +_PROGRESSBAR_UPDATE_INTERNAL = 5 + + +def _monitor(paths: List[Path], num_total: int) -> None: + with tqdm(total=num_total) as pbar: + while True: + time_start = monotonic() + try: + num_done = len( + list( + itertools.chain.from_iterable( + p.glob(f"**/*.ast.json") for p in paths + ) + ) + ) + except Exception: + continue + time_elapsed = monotonic() - time_start + if time_elapsed < _PROGRESSBAR_UPDATE_INTERNAL: + sleep(_PROGRESSBAR_UPDATE_INTERNAL - time_elapsed) + pbar.update(num_done - pbar.n) + if num_done >= num_total: + break + print("") + + +@contextmanager +def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, None]: + """Launch an async progressbar to monitor the progress of tracing the repo.""" + paths = [Path(p) for p in paths] + olean_files = list( + itertools.chain.from_iterable(p.glob("**/*.olean") for p in paths) + ) + num_total = len(olean_files) + p = Process(target=_monitor, args=(paths, num_total), daemon=True) + p.start() + yield + p.kill() + + +def get_lean_version() -> str: + """Get the version of Lean.""" + output = run_cmd("lean --version", capture_output=True).strip() + m = re.match(r"Lean \(version (?P\S+?),", output) + return m["version"] + + +def check_files(packages_path: str, no_deps: bool) -> bool: + """Check if all *.lean files have been processed to produce *.ast.json and *.dep_paths files.""" + cwd = Path.cwd() + packages_path = cwd / packages_path + jsons = { + p.with_suffix("").with_suffix("") + for p in cwd.glob("**/build/ir/**/*.ast.json") + if not no_deps or not p.is_relative_to(packages_path) + } + deps = { + p.with_suffix("") + for p in cwd.glob("**/build/ir/**/*.dep_paths") + if not no_deps or not p.is_relative_to(packages_path) + } + oleans = { + Path(str(p.with_suffix("")).replace("/build/lib/", "/build/ir/")) + for p in cwd.glob("**/build/lib/**/*.olean") + if not no_deps or not p.is_relative_to(packages_path) + } + assert len(jsons) <= len(oleans) and len(deps) <= len(oleans) + missing_jsons = {p.with_suffix(".ast.json") for p in oleans - jsons} + missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} + if len(missing_jsons) > 0 or len(missing_deps) > 0: + for p in missing_jsons.union(missing_deps): + logger.warning(f"Missing {p}") + return False + return True + + +def is_new_version(v: str) -> bool: + """Check if ``v`` is at least `4.3.0-rc2`.""" + major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] + if major < 4 or (major == 4 and minor < 3): + return False + if ( + major > 4 + or (major == 4 and minor > 3) + or (major == 4 and minor == 3 and patch > 0) + ): + return True + assert major == 4 and minor == 3 and patch == 0 + if "4.3.0-rc" in v: + rc = int(v.split("-")[1][2:]) + return rc >= 2 + else: + return True + + +def main() -> None: + import ipdb; ipdb.set_trace() + parser = argparse.ArgumentParser() + parser.add_argument("repo_name") + parser.add_argument("--no-deps", action="store_true") + args = parser.parse_args() + + num_procs = 
int(os.environ["NUM_PROCS"]) + repo_name = args.repo_name + os.chdir(repo_name) + + if is_new_version(get_lean_version()): + packages_path = ".lake/packages" + build_path = ".lake/build" + else: + packages_path = "lake-packages" + build_path = "build" + + # if check_files(packages_path, args.no_deps): + # logger.info(f"The repo {repo_name} has already been traced.") + # return + + # If the lean4 package exists, we assume the build has completed and we just need to trace + if (Path(".lake/packages/lean4") if is_new_version(get_lean_version()) else Path("lake-packages/lean4")).exists(): + logger.info(f"The repo {repo_name} has already been built, but has not been traced.") + else: + # Build the repo using lake. + logger.info(f"Building {repo_name}") + if args.no_deps: + # The additional *.olean files wouldn't matter. + try: + run_cmd("lake exe cache get") + except subprocess.CalledProcessError: + pass + run_cmd("lake build") + + # Copy the Lean 4 stdlib into the path of packages. + lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() + shutil.copytree(lean_prefix, f"{packages_path}/lean4") + + + # Run ExtractData.lean to extract ASTs, tactic states, and premise information. + dirs_to_monitor = [build_path] + if not args.no_deps: + dirs_to_monitor.append(packages_path) + + logger.info(f"Tracing {repo_name}") + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + + assert check_files(packages_path, args.no_deps), "Some files failed to be processed." + + +if __name__ == "__main__": + main() diff --git a/data_extraction/cache.py b/data_extraction/cache.py new file mode 100644 index 0000000..20bae84 --- /dev/null +++ b/data_extraction/cache.py @@ -0,0 +1,107 @@ +"""Cache manager of traced repos. +""" + +import os +import shutil +import tarfile +from pathlib import Path +from loguru import logger +from filelock import FileLock +from dataclasses import dataclass, field +from typing import Optional, Tuple, Generator + +from ..utils import ( + execute, + url_exists, + get_repo_info, + report_critical_failure, +) +from ..constants import ( + CACHE_DIR, + DISABLE_REMOTE_CACHE, + REMOTE_CACHE_URL, +) + + +def _split_git_url(url: str) -> Tuple[str, str]: + """Split a Git URL into user name and repo name.""" + if url.endswith("/"): + url = url[:-1] + assert not url.endswith("/"), f"Unexpected URL: {url}" + fields = url.split("/") + user_name = fields[-2] + repo_name = fields[-1] + return user_name, repo_name + + +def _format_dirname(url: str, commit: str) -> str: + user_name, repo_name = _split_git_url(url) + return f"{user_name}-{repo_name}-{commit}" + + +_CACHE_CORRPUTION_MSG = "The cache may have been corrupted!" + + +@dataclass(frozen=True, eq=False) +class Cache: + """Cache manager.""" + + cache_dir: Path + lock: FileLock = field(init=False, repr=False) + + def __iter__(self) -> Generator[Path, None, None]: + """Iterate over all traced repos in the cache.""" + yield from self.cache_dir.glob("*") + + def __post_init__(self): + if not os.path.exists(self.cache_dir): + self.cache_dir.mkdir(parents=True) + lock_path = self.cache_dir.with_suffix(".lock") + object.__setattr__(self, "lock", FileLock(lock_path)) + + def get(self, url: str, commit: str) -> Optional[Path]: + """Get the path of a traced repo with URL ``url`` and commit hash ``commit``. 
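The local cache directory is checked first; unless ``DISABLE_REMOTE_CACHE`` is set, the remote cache is tried as a fallback.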
Return None if no such repo can be found."""
+        _, repo_name = _split_git_url(url)
+        dirname = _format_dirname(url, commit)
+        dirpath = self.cache_dir / dirname
+
+        with self.lock:
+            if dirpath.exists():
+                assert (dirpath / repo_name).exists()
+                return dirpath / repo_name
+
+            elif not DISABLE_REMOTE_CACHE:
+                url = os.path.join(REMOTE_CACHE_URL, f"{dirname}.tar.gz")
+                if not url_exists(url):
+                    return None
+                logger.info(
+                    f"Downloading the traced repo from the remote cache. Set the environment variable `DISABLE_REMOTE_CACHE` if you want to trace the repo locally."
+                )
+                execute(f"wget {url} -O {dirpath}.tar.gz")
+
+                with report_critical_failure(_CACHE_CORRPUTION_MSG):
+                    with tarfile.open(f"{dirpath}.tar.gz") as tar:
+                        tar.extractall(self.cache_dir)
+                    os.remove(f"{dirpath}.tar.gz")
+                    assert (dirpath / repo_name).exists()
+
+                return dirpath / repo_name
+
+            else:
+                return None
+
+    def store(self, src: Path) -> Path:
+        """Store a traced repo at path ``src``. Return its path in the cache."""
+        url, commit = get_repo_info(src)
+        dirpath = self.cache_dir / _format_dirname(url, commit)
+        _, repo_name = _split_git_url(url)
+        if not dirpath.exists():
+            with self.lock:
+                with report_critical_failure(_CACHE_CORRPUTION_MSG):
+                    shutil.copytree(src, dirpath / repo_name)
+        return dirpath / repo_name
+
+
+cache = Cache(CACHE_DIR)
+"""A global :class:`Cache` object managing LeanDojo's caching of traced repos (see :ref:`caching`).
+"""
diff --git a/data_extraction/lean.py b/data_extraction/lean.py
new file mode 100644
index 0000000..187288c
--- /dev/null
+++ b/data_extraction/lean.py
@@ -0,0 +1,702 @@
+"""This module defines classes for repos, files, and theorems in Lean.
+Objects of these classes contain only surface information, without extracting any trace.
+"""
+
+import re
+import os
+import json
+import toml
+import time
+import urllib
+import webbrowser
+from pathlib import Path
+from loguru import logger
+from functools import cache
+from github import Github, Auth
+from dataclasses import dataclass, field
+from github.Repository import Repository
+from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator
+
+
+from ..utils import (
+    execute,
+    read_url,
+    url_exists,
+    get_repo_info,
+    working_directory,
+)
+from ..constants import LEAN4_URL
+from .cache import _split_git_url
+
+
+GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None)
+"""GitHub personal access token is optional.
+If provided, it can increase the rate limit for GitHub API calls.
+"""
+
+if GITHUB_ACCESS_TOKEN:
+    logger.debug("Using GitHub personal access token for authentication")
+    GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN))
+    GITHUB.get_user().login
+else:
+    logger.debug(
+        "Using GitHub without authentication. Don't be surprised if you hit the API rate limit."
+    )
+    GITHUB = Github()
+
+LEAN4_REPO = GITHUB.get_repo("leanprover/lean4")
+"""The GitHub Repo for Lean 4 itself."""
+
+LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly")
+"""The GitHub Repo for Lean 4 nightly releases."""
+
+_URL_REGEX = re.compile(r"(?P<url>.*?)/*")
+
+
+def normalize_url(url: str) -> str:
+    return _URL_REGEX.fullmatch(url)["url"]  # Remove trailing `/`.
+
+
+@cache
+def url_to_repo(url: str, num_retries: int = 2) -> Repository:
+    url = normalize_url(url)
+    backoff = 1
+
+    while True:
+        try:
+            return GITHUB.get_repo("/".join(url.split("/")[-2:]))
+        except Exception as ex:
+            if num_retries <= 0:
+                raise ex
+            num_retries -= 1
+            logger.debug(f'url_to_repo("{url}") failed. 
Retrying...') + time.sleep(backoff) + backoff *= 2 + + +@cache +def get_latest_commit(url: str) -> str: + """Get the hash of the latest commit of the Git repo at ``url``.""" + repo = url_to_repo(url) + return repo.get_branch(repo.default_branch).commit.sha + + +def cleanse_string(s: Union[str, Path]) -> str: + """Replace : and / with _ in a string.""" + return str(s).replace("/", "_").replace(":", "_") + + +@cache +def _to_commit_hash(repo: Repository, label: str) -> str: + """Convert a tag or branch to a commit hash.""" + logger.debug(f"Querying the commit hash for {repo.name} {label}") + + # Poor man's cache + if repo.name == "lean4": + if label == "v4.23.0-rc2": + return "ad1a017949674a947f0d6794cbf7130d642c6530" + elif label == "v4.17.0": + return "306f36116535cd226329f562b4675b8b6dbf948c" + elif label == "v4.8.0-rc2": + return "873ef2d894af80d8fc672e35f7e28bae314a1f6f" + + # if the label is a commit hash, return it directly + if len(label) == 40 and _COMMIT_REGEX.fullmatch(label.strip()): + return label + + for branch in repo.get_branches(): + if branch.name == label: + print(f"Found branch {branch.name} with commit {branch.commit.sha}") + return branch.commit.sha + + for tag in repo.get_tags(): + if tag.name == label: + print(f"Found tag {tag.name} with commit {tag.commit.sha}") + return tag.commit.sha + + raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + + +@dataclass(eq=True, unsafe_hash=True) +class Pos: + """Position in source files. + + We use 1-index to keep it consistent with code editors such as Visual Studio Code. + """ + + line_nb: int + """Line number + """ + + column_nb: int + """Column number + """ + + @classmethod + def from_str(cls, s: str) -> "Pos": + """Construct a :class:`Pos` object from its string representation, e.g., :code:`"(323, 1109)"`.""" + assert s.startswith("(") and s.endswith( + ")" + ), f"Invalid string representation of a position: {s}" + line, column = s[1:-1].split(",") + line_nb = int(line) + column_nb = int(column) + return cls(line_nb, column_nb) + + def __iter__(self) -> Generator[int, None, None]: + yield self.line_nb + yield self.column_nb + + def __repr__(self) -> str: + return repr(tuple(self)) + + def __lt__(self, other): + return self.line_nb < other.line_nb or ( + self.line_nb == other.line_nb and self.column_nb < other.column_nb + ) + + def __le__(self, other): + return self < other or self == other + + +@dataclass(frozen=True) +class LeanFile: + """A Lean source file (:file:`*.lean`).""" + + root_dir: Path = field(repr=False) + """Root directory of the traced repo this :class:`LeanFile` object belongs to. + + ``root_dir`` must be an absolute path, e.g., :file:`/home/kaiyu/traced_lean-example/lean-example` + """ + + path: Path + """Relative path w.r.t. ``root_dir`` + + E.g., :file:`lean-example/src/example.lean` + """ + + code: List[str] = field(init=False, repr=False) + """Raw source code as a list of lines.""" + + endwith_newline: bool = field(init=False, repr=False) + """Whether the last line ends with a newline.""" + + num_bytes: List[int] = field(init=False, repr=False) + """The number of UTF-8 bytes of each line, including newlines. 
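+
+    Used by :meth:`convert_pos` to map Lean's byte-based :code:`String.Pos` offsets to line/column positions.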
+ """ + + def __post_init__(self) -> None: + assert ( + self.root_dir.is_absolute() + ), f"Root directory must be an absolute path: {self.root_dir}" + assert self.path.suffix == ".lean", f"File extension must be .lean: {self.path}" + assert not self.path.is_absolute(), f"Path must be a relative path: {self.path}" + + code = [] + endwith_newline = None + num_bytes = [] + + for line in self.abs_path.open("rb"): + if b"\r\n" in line: + raise RuntimeError( + f"{self.abs_path} contains Windows-style line endings. This is discouraged (see https://github.com/leanprover-community/mathlib4/pull/6506)." + ) + if line.endswith(b"\n"): + endwith_newline = True + line = line[:-1] + else: + endwith_newline = False + code.append(line.decode("utf-8")) + num_bytes.append(len(line) + 1) + + object.__setattr__(self, "code", code) + object.__setattr__(self, "endwith_newline", endwith_newline) + object.__setattr__(self, "num_bytes", num_bytes) + + @property + def abs_path(self) -> Path: + """Absolute path of a :class:`LeanFile` object. + + E.g., :file:`/home/kaiyu/traced_lean-example/lean-example/src/example.lean` + """ + return self.root_dir / self.path + + @property + def num_lines(self) -> int: + """Number of lines in a source file.""" + return len(self.code) + + def num_columns(self, line_nb: int) -> int: + """Number of columns in a source file.""" + return len(self.get_line(line_nb)) + + @property + def start_pos(self) -> Pos: + """Return the start position of a source file. + + Returns: + Pos: A :class:`Pos` object representing the start of this file. + """ + return Pos(1, 1) + + @property + def end_pos(self) -> Pos: + """Return the end position of a source file. + + Args: + zero_indexed (bool, optional): Whether to use 0-index instead of 1-index. Defaults to False. + + Returns: + Pos: A :class:`Pos` object representing the end of this file. + """ + # Line and column numbers are 1-indexed by default. + line_nb = self.num_lines + column_nb = 1 + len(self.code[-1]) + return Pos(line_nb, column_nb) + + def convert_pos(self, byte_idx: int) -> Pos: + """Convert a byte index (:code:`String.Pos` in Lean 4) to a :class:`Pos` object.""" + n = 0 + for i, num_bytes in enumerate(self.num_bytes, start=1): + n += num_bytes + if n > byte_idx: + line_byte_idx = byte_idx - (n - num_bytes) + if line_byte_idx == 0: + return Pos(i, 1) + + line = self.get_line(i) + m = 0 + + for j, c in enumerate(line, start=1): + m += len(c.encode("utf-8")) + if m >= line_byte_idx: + return Pos(i, j + 1) + + raise ValueError(f"Invalid byte index {byte_idx} in {self.path}.") + + def offset(self, pos: Pos, delta: int) -> Pos: + """Off set a position by a given number.""" + line_nb, column_nb = pos + num_columns = len(self.get_line(line_nb)) - column_nb + 1 + if delta <= num_columns: + return Pos(line_nb, column_nb + delta) + delta_left = delta - num_columns - 1 + + for i in range(line_nb, self.num_lines): + line = self.code[i] + l = len(line) + if delta_left <= l: + return Pos(i + 1, delta_left + 1) + delta_left -= l + 1 + + if delta_left == 0 and self.endwith_newline: + return Pos(self.num_lines + 1, 1) + + raise ValueError(f"Invalid offset {delta} in {self.path}: {pos}.") + + def get_line(self, line_nb: int) -> str: + """Return a given line of the source file. + + Args: + line_nb (int): Line number (1-indexed). + """ + return self.code[line_nb - 1] + + def __getitem__(self, key) -> str: + """Return a code segment given its start/end positions. + + This enables ``lean_file[start:end]``. 
+
+        Args:
+            key (slice): A slice of two :class:`Pos` objects for the start/end of the code segment.
+        """
+        assert isinstance(key, slice) and key.step is None
+        if key.start is None:
+            start_line = start_column = 1
+        else:
+            start_line, start_column = key.start
+        if key.stop is None:
+            end_line = self.num_lines
+            end_column = 1 + len(self.get_line(end_line))
+        else:
+            end_line, end_column = key.stop
+        if start_line == end_line:
+            assert start_column <= end_column
+            return self.get_line(start_line)[start_column - 1 : end_column - 1]
+        else:
+            assert start_line < end_line
+            code_slice = [self.code[start_line - 1][start_column - 1 :]]
+            for line_nb in range(start_line + 1, end_line):
+                code_slice.append(self.get_line(line_nb))
+            code_slice.append(self.get_line(end_line)[: end_column - 1])
+            return "\n".join(code_slice)
+
+
+_COMMIT_REGEX = re.compile(r"[0-9a-z]+")
+_LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P<version>.+?)")
+
+
+def get_lean4_version_from_config(toolchain: str) -> str:
+    """Return the required Lean version given a ``lean-toolchain`` config."""
+    m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip())
+    assert m is not None, "Invalid config."
+    return m["version"]
+
+
+def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str:
+    """Return the required Lean commit given a ``lean-toolchain`` config."""
+    assert "content" in config_dict, "config_dict must have a 'content' field"
+    config = config_dict["content"].strip()
+    prefix = "leanprover/lean4:"
+
+    if config == f"{prefix}nightly":
+        latest_tag = LEAN4_NIGHTLY_REPO.get_tags()[0]
+        return latest_tag.commit.sha
+
+    assert config.startswith(prefix), f"Invalid Lean 4 version: {config}"
+    version = config[len(prefix) :]
+
+    if version.startswith("nightly"):
+        return _to_commit_hash(LEAN4_NIGHTLY_REPO, version)
+    else:
+        return _to_commit_hash(LEAN4_REPO, version)
+
+
+URL = TAG = COMMIT = str
+
+
+@dataclass(frozen=True)
+class RepoInfoCache:
+    """To minimize the number of network requests, we cache and re-use the info
+    of all repos, assuming it does not change during the execution of LeanDojo."""
+
+    tag2commit: Dict[Tuple[URL, TAG], COMMIT] = field(default_factory=dict)
+    lean_version: Dict[Tuple[URL, COMMIT], str] = field(default_factory=dict)
+
+
+info_cache = RepoInfoCache()
+
+
+_LAKEFILE_LEAN_GIT_REQUIREMENT_REGEX = re.compile(
+    r"require\s+(?P<name>\S+)\s+from\s+git\s+\"(?P<url>.+?)\"(\s+@\s+\"(?P<rev>\S+)\")?"
+)
+
+_LAKEFILE_LEAN_LOCAL_REQUIREMENT_REGEX = re.compile(r"require \S+ from \"")
+
+_LAKEFILE_TOML_REQUIREMENT_REGEX = re.compile(r"(?<=\[\[require\]\]).+(?=\n\n)")
+
+
+def is_supported_version(v) -> bool:
+    """Check if ``v`` is at least `v4.3.0-rc2`."""
+    if not v.startswith("v"):
+        return False
+    v = v[1:]
+    major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")]
+    if major < 4 or (major == 4 and minor < 3):
+        return False
+    if (
+        major > 4
+        or (major == 4 and minor > 3)
+        or (major == 4 and minor == 3 and patch > 0)
+    ):
+        return True
+    assert major == 4 and minor == 3 and patch == 0
+    if "4.3.0-rc" in v:
+        rc = int(v.split("-")[1][2:])
+        return rc >= 2
+    else:
+        return True
+
+
+@dataclass(frozen=True)
+class LeanGitRepo:
+    """Git repo of a Lean project."""
+
+    url: str
+    """The repo's Github URL.
+
+    Note that we only support Github as of now.
+    """
+
+    commit: str
+    """The repo's commit hash.
+
+    You can also use tags such as ``v3.5.0``. They will be converted to commit hashes.
+    """
+
+    repo: Repository = field(init=False, repr=False)
+    """A :class:`github.Repository` object.
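+
+    Resolved from ``url`` via the GitHub API in ``__post_init__``.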
+ """ + + lean_version: str = field(init=False, repr=False) + """Required Lean version. + """ + + def __post_init__(self) -> None: + if "github.com" not in self.url: + raise ValueError(f"{self.url} is not a Github URL") + if not self.url.startswith("https://"): + raise ValueError(f"{self.url} is not a valid URL") + object.__setattr__(self, "url", normalize_url(self.url)) + object.__setattr__(self, "repo", url_to_repo(self.url)) + + # Convert tags or branches to commit hashes + if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): + if (self.url, self.commit) in info_cache.tag2commit: + commit = info_cache.tag2commit[(self.url, self.commit)] + else: + commit = _to_commit_hash(self.repo, self.commit) + assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" + info_cache.tag2commit[(self.url, self.commit)] = commit + object.__setattr__(self, "commit", commit) + + # Determine the required Lean version. + if (self.url, self.commit) in info_cache.lean_version: + lean_version = info_cache.lean_version[(self.url, self.commit)] + elif self.is_lean4: + lean_version = self.commit + else: + config = self.get_config("lean-toolchain") + lean_version = get_lean4_commit_from_config(config) + v = get_lean4_version_from_config(config["content"]) + if not is_supported_version(v): + logger.warning( + f"{self} relies on an unsupported Lean version: {lean_version}" + ) + info_cache.lean_version[(self.url, self.commit)] = lean_version + object.__setattr__(self, "lean_version", lean_version) + + @classmethod + def from_path(cls, path: Path) -> "LeanGitRepo": + """Construct a :class:`LeanGitRepo` object from the path to a local Git repo.""" + url, commit = get_repo_info(path) + return cls(url, commit) + + @property + def name(self) -> str: + return self.repo.name + + @property + def is_lean4(self) -> bool: + return self.url == LEAN4_URL + + @property + def commit_url(self) -> str: + return os.path.join(self.url, f"tree/{self.commit}") + + def show(self) -> None: + """Show the repo in the default browser.""" + webbrowser.open(self.commit_url) + + def exists(self) -> bool: + return url_exists(self.commit_url) + + def clone_and_checkout(self) -> None: + """Clone the repo to the current working directory and checkout a specific commit.""" + # Check if the repo already exists. + # If it exists, we assume it has been checked out to the correct commit. + + user_name, repo_name = _split_git_url(self.url) + local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + if os.path.exists(local_repo_path): + logger.info(f"{self} already exists locally.") + else: + logger.debug(f"Cloning {self}") + execute(f"git clone -n --recursive {self.url}", capture_output=True) + + + with working_directory(local_repo_path): + execute( + f"git checkout {self.commit} && git submodule update --recursive", + capture_output=True, + ) + + def get_dependencies( + self, path: Union[str, Path, None] = None + ) -> Dict[str, "LeanGitRepo"]: + """Return the dependencies required by the target repo. + + Args: + path (Union[str, Path, None], optional): Root directory of the repo if it is on the disk. + + Returns: + Dict[str, :class:`LeanGitRepo`]: A dictionary mapping the name of each + dependency to its :class:`LeanGitRepo` object. 
+ """ + logger.debug(f"Querying the dependencies of {self}") + + toolchain = ( + self.get_config("lean-toolchain") + if path is None + else {"content": (Path(path) / "lean-toolchain").open().read()} + ) + commit = get_lean4_commit_from_config(toolchain) + deps = {"lean4": LeanGitRepo(LEAN4_URL, commit)} + + try: + lake_manifest = ( + self.get_config("lake-manifest.json", num_retries=0) + if path is None + else json.load((Path(path) / "lake-manifest.json").open()) + ) + for pkg in lake_manifest["packages"]: + deps[pkg["name"]] = LeanGitRepo(pkg["url"], pkg["rev"]) + except Exception: + for name, repo in self._parse_lakefile_dependencies(path): + if name not in deps: + deps[name] = repo + for dd_name, dd_repo in repo.get_dependencies().items(): + deps[dd_name] = dd_repo + + return deps + + def _parse_lakefile_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + if self.uses_lakefile_lean(): + return self._parse_lakefile_lean_dependencies(path) + else: + return self._parse_lakefile_toml_dependencies(path) + + def _parse_lakefile_lean_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + lakefile = ( + self.get_config("lakefile.lean")["content"] + if path is None + else (Path(path) / "lakefile.lean").open().read() + ) + + if _LAKEFILE_LEAN_LOCAL_REQUIREMENT_REGEX.search(lakefile): + raise ValueError("Local dependencies are not supported.") + + return self._parse_deps(_LAKEFILE_LEAN_GIT_REQUIREMENT_REGEX.finditer(lakefile)) + + def _parse_deps( + self, matches: Union[Iterator[re.Match[str]], Dict[str, str]] + ) -> List[Tuple[str, "LeanGitRepo"]]: + deps = [] + + for m in matches: + url = m["url"] + if url.endswith(".git"): + url = url[:-4] + if url.startswith("git@"): + url = "https://" + url[4:].replace(":", "/") + + rev = m["rev"] + if rev is None: + commit = get_latest_commit(url) + elif len(rev) == 40 and _COMMIT_REGEX.fullmatch(rev): + commit = rev + else: + try: + commit = _to_commit_hash(url_to_repo(url), rev) + except ValueError: + commit = get_latest_commit(url) + assert _COMMIT_REGEX.fullmatch(commit) + + deps.append((m["name"], LeanGitRepo(url, commit))) + + return deps + + def _parse_lakefile_toml_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + lakefile = ( + self.get_config("lakefile.toml")["content"] + if path is None + else (Path(path) / "lakefile.toml").open().read() + ) + matches = dict() + + for requirement in _LAKEFILE_TOML_REQUIREMENT_REGEX.finditer(lakefile): + for line in requirement.strip().splitlines(): + key, value = line.split("=") + key = key.strip() + value = value.strip() + if key == "path": + raise ValueError("Local dependencies are not supported.") + if key == "git": + matches["url"] = value + if key == "rev": + matches["rev"] = value + if key == "name": + matches["name"] = value + + return self._parse_deps(lakefile, matches) + + def get_license(self) -> Optional[str]: + """Return the content of the ``LICENSE`` file.""" + assert "github.com" in self.url, f"Unsupported URL: {self.url}" + url = self.url.replace("github.com", "raw.githubusercontent.com") + license_url = f"{url}/{self.commit}/LICENSE" + try: + return read_url(license_url) + except urllib.error.HTTPError: + return None + + def _get_config_url(self, filename: str) -> str: + assert "github.com" in self.url, f"Unsupported URL: {self.url}" + url = self.url.replace("github.com", "raw.githubusercontent.com") + return f"{url}/{self.commit}/{filename}" + + def get_config(self, filename: 
str, num_retries: int = 2) -> Dict[str, Any]: + """Return the repo's files.""" + config_url = self._get_config_url(filename) + content = read_url(config_url, num_retries) + if filename.endswith(".toml"): + return toml.loads(content) + elif filename.endswith(".json"): + return json.loads(content) + else: + return {"content": content} + + def uses_lakefile_lean(self) -> bool: + """Check if the repo uses a ``lakefile.lean``.""" + url = self._get_config_url("lakefile.lean") + return url_exists(url) + + def uses_lakefile_toml(self) -> bool: + """Check if the repo uses a ``lakefile.toml``.""" + url = self._get_config_url("lakefile.toml") + return url_exists(url) + + +@dataclass(frozen=True) +class Theorem: + """Theorem in Lean. + + Theorems are named constants of type :code:`Prop`. They are typically defined + using the keywords :code:`theorem` or :code:`lemma`, but it's possible to use other + keywords such as :code:`def` or :code:`instance` + """ + + repo: LeanGitRepo + """Lean repo the theorem comes from. + """ + + file_path: Path + """Lean source file the theorem comes from. + """ + + full_name: str + """Fully qualified name of the theorem. + """ + + def __post_init__(self) -> None: + if isinstance(self.file_path, str): + object.__setattr__(self, "file_path", Path(self.file_path)) + assert ( + self.file_path.suffix == ".lean" + ), f"File extension must be .lean: {self.file_path}" + + @property + def uid(self) -> str: + """Unique identifier of the theorem.""" + return f"{cleanse_string(self.repo.url)}@{cleanse_string(self.repo.commit)}:{cleanse_string(self.file_path.__str__())}:{cleanse_string(self.full_name)}" + + @property + def uhash(self) -> str: + """Unique hash of the theorem.""" + return str(hash(self.uid) ** 2) diff --git a/data_extraction/trace.py b/data_extraction/trace.py new file mode 100644 index 0000000..cd1ddc2 --- /dev/null +++ b/data_extraction/trace.py @@ -0,0 +1,135 @@ +"""This module provides the main interfaces for tracing Lean repos, i.e., extracting data from them. +To estimate the time for tracing a repo, a good rule of thumb is 1.5x the time for compiling the repo using :code:`leanpkg build`. +A repo has to be traced only once, and the traced repo will be stored in a cache for fast access in the future. +""" + +import os +import shutil +from pathlib import Path +from loguru import logger +from typing import Union, Optional +from subprocess import CalledProcessError + +from .cache import cache, _split_git_url +from .lean import LeanGitRepo +from ..constants import NUM_PROCS +from .traced_data import TracedRepo +from ..utils import working_directory +from ..container import create_mounts, get_container, NativeContainer + + +LEAN4_BUILD_SCRIPT_PATH = Path(__file__).with_name("build_lean4_repo.py") +LEAN4_DATA_EXTRACTOR_PATH = Path(__file__).with_name("ExtractData.lean") + +def _trace(repo: LeanGitRepo, build_deps: bool) -> None: + assert ( + repo.exists() + ), f"The {repo} does not exist. Please check the URL `{repo.commit_url}`." + + # Trace `repo` in the current working directory. + assert not repo.is_lean4, "Cannot trace Lean 4 itself." 
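+
+    # Clone the repo under $RAID_DIR/repos/<user>/<repo> if needed, then mount it,
+    # build_lean4_repo.py, and ExtractData.lean into the container's /workspace.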
+ + user_name, repo_name = _split_git_url(repo.url) + local_repo_path = Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name + if not local_repo_path.exists(): + repo.clone_and_checkout() + + logger.debug(f"Tracing {repo}") + container = get_container() + mts = { + Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name: f"/workspace/{user_name}/{repo_name}", + LEAN4_BUILD_SCRIPT_PATH: f"/workspace/{LEAN4_BUILD_SCRIPT_PATH.name}", + LEAN4_DATA_EXTRACTOR_PATH: f"/workspace/{LEAN4_DATA_EXTRACTOR_PATH.name}", + } + + + cmd = f"python build_lean4_repo.py {user_name}/{repo_name}" + if not build_deps: + cmd += " --no-deps" + + try: + import ipdb; ipdb.set_trace() + container.run( + cmd, + create_mounts(mts), + {"NUM_PROCS": NUM_PROCS}, + as_current_user=True, + work_dir="/workspace", + ) + except CalledProcessError as ex: + if repo.is_lean4 and isinstance(container, NativeContainer): + logger.error( + "Failed to build Lean 4 without Docker. See https://leandojo.readthedocs.io/en/latest/user-guide.html#advanced-running-within-docker." + ) + raise ex + + +def is_available_in_cache(repo: LeanGitRepo) -> bool: + """Check if ``repo`` has a traced repo available in the cache (including the remote cache).""" + return cache.get(repo.url, repo.commit) is not None + + +def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: + """Return the path of a traced repo in the cache. + + The function will trace a repo if it is not available in the cache. See :ref:`caching` for details. + + Args: + repo (LeanGitRepo): The Lean repo to trace. + build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True. + + Returns: + Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2` + """ + path = cache.get(repo.url, repo.commit) + + if path is None: + logger.info(f"Tracing {repo}") + user_name, repo_name = _split_git_url(repo.url) + with Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name as tmp_dir: + print(tmp_dir) + logger.debug(f"Working in the temporary directory {tmp_dir}") + print("About to trace") + _trace(repo, build_deps) + traced_repo = TracedRepo.from_traced_files(tmp_dir, build_deps) + traced_repo.save_to_disk() + path = cache.store(tmp_dir) + else: + logger.debug("The traced repo is available in the cache.") + return path + + +def trace( + repo: LeanGitRepo, + dst_dir: Optional[Union[str, Path]] = None, + build_deps: bool = True, +) -> TracedRepo: + """Trace a repo (and its dependencies), saving the results to ``dst_dir``. + + The function only traces the repo when it's not available in the cache. Otherwise, + it directly copies the traced repo from the cache to ``dst_dir``. See :ref:`caching` for details. + + Args: + repo (LeanGitRepo): The Lean repo to trace. + dst_dir (Union[str, Path]): The directory for saving the traced repo. If None, the traced repo is only saved in the cahe. + build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True. + + Returns: + TracedRepo: A :class:`TracedRepo` object corresponding to the files at ``dst_dir``. + """ + if dst_dir is not None: + dst_dir = Path(dst_dir) + assert ( + not dst_dir.exists() + ), f"The destination directory {dst_dir} already exists." 
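+
+    # Trace the repo (or fetch it from the cache) first; it is copied into dst_dir
+    # below only after the traced repo passes check_sanity().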
+ + cached_path = get_traced_repo_path(repo, build_deps) + logger.info(f"Loading the traced repo from {cached_path}") + traced_repo = TracedRepo.load_from_disk(cached_path, build_deps) + traced_repo.check_sanity() + + if dst_dir is not None: + dst_dir.mkdir(parents=True) + shutil.copytree(cached_path, dst_dir / cached_path.name) + + return traced_repo diff --git a/data_extraction/traced_data.py b/data_extraction/traced_data.py new file mode 100644 index 0000000..82317b2 --- /dev/null +++ b/data_extraction/traced_data.py @@ -0,0 +1,1224 @@ +"""This module defines traced repos/files/theorems. +""" + +import re +import os +import ray +import json +import random +import itertools +import webbrowser +import networkx as nx +from tqdm import tqdm +from lxml import etree +from pathlib import Path +from loguru import logger +from dataclasses import dataclass, field +from typing import List, Optional, Dict, Any, Tuple, Union + +from ..utils import ( + is_git_repo, + compute_md5, + ray_actor_pool, + to_lean_path, + to_dep_path, + to_json_path, + to_xml_path, +) +from .ast import * +from .lean import LeanFile, LeanGitRepo, Theorem, Pos +from ..constants import NUM_WORKERS, LOAD_USED_PACKAGES_ONLY, LEAN4_PACKAGES_DIR + + +@dataclass(frozen=True) +class Comment: + """A comment in a Lean file.""" + + start: Pos + end: Pos + text: str + + def __post_init__(self) -> None: + assert isinstance(self.start, Pos) + assert isinstance(self.end, Pos) + assert self.start <= self.end + assert isinstance(self.text, str) + + def to_xml(self, parent: etree.Element) -> None: + tree = etree.SubElement(parent, self.__class__.__name__) + tree.set("start", str(self.start)) + tree.set("end", str(self.end)) + tree.set("text", self.text) + + @classmethod + def from_xml(cls, tree: etree.Element) -> "Comment": + start = Pos.from_str(tree.attrib["start"]) + end = Pos.from_str(tree.attrib["end"]) + text = tree.attrib["text"] + return cls(start, end, text) + + +def _collect_lean4_comments(ast: FileNode) -> List[Comment]: + comments = [] + + def _callback(node, _): + nonlocal comments + if isinstance(node, CommandModuledocNode) or isinstance( + node, CommandDoccommentNode + ): + comments.append(Comment(node.start, node.end, node.comment)) + elif is_leaf(node) and node.trailing.strip().startswith("--"): + num_spaces = node.trailing.index("--") + text = node.trailing[num_spaces:] + start = node.lean_file.offset(node.end, num_spaces) + end = node.lean_file.offset(start, len(text)) + comments.append(Comment(start, end, text)) + + ast.traverse_preorder(_callback, node_cls=None) + return comments + + +_SINGLE_LINE_COMMENT_REGEX = r"--.*?(\n|$)" +_MULTI_LINE_COMMENT_REGEX = r"/-.*?(-/|$)" +_COMMENT_REGEX = re.compile( + f"{_SINGLE_LINE_COMMENT_REGEX}|{_MULTI_LINE_COMMENT_REGEX}", re.DOTALL +) + + +def get_code_without_comments( + lean_file: LeanFile, start: Pos, end: Pos, comments: List[Comment] +) -> str: + """Return the code in ``lean_file`` from ``start`` to ``end`` with comments removed. + + Args: + lean_file (LeanFile): The lean source file. + start (Pos): The start position. + end (Pos): The end position. + comments (List[Comment]): A list of :class:`Comment` objects. + + Returns: + str: Human-written code with comments removed. 
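+
+    Comments listed in ``comments`` are spliced out first; any remaining ``--`` or
+    ``/- ... -/`` comments are removed by a regex as a fallback.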
+ """ + base = start + code_segs = [] + + for c in comments: + if base <= c.start and c.end <= end: + code_segs.append(lean_file[base : c.start]) + base = c.end + + code_segs.append(lean_file[base:end]) + code = "".join(code_segs) + + code = _COMMENT_REGEX.sub("", code) + assert "--" not in code and "/-" not in code + + return code.strip() + + +@dataclass(frozen=True) +class TracedTactic: + """A traced tactic is a tactic annotated with additional information including + its AST and the states before/after the tactic. + """ + + ast: Node = field(repr=False) + """AST of the tactic. + """ + + traced_theorem: Optional["TracedTheorem"] = field( + default=None, repr=False, compare=False + ) + """The traced theorem this tactic belongs to. + """ + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_theorem"} + d["traced_theorem"] = None # Avoid serializing the traced theorem. + return d + + @property + def tactic(self) -> str: + """The raw tactic string.""" + return self.ast.tactic + + @property + def state_before(self) -> str: + """Pretty-printed state before applying the tactic.""" + assert self.ast.state_before is not None + return self.ast.state_before + + @property + def state_after(self) -> str: + """Pretty-printed state after applying the tactic.""" + assert self.ast.state_after is not None + return self.ast.state_after + + @property + def start(self) -> Pos: + """Start position in :file:`*.lean` file.""" + return self.ast.start + + @property + def end(self) -> Pos: + """End position in :file:`*.lean` file.""" + return self.ast.end + + def to_string(self) -> str: + return f"{self.__class__.__name__}(tactic={self.tactic}, state_before={self.state_before}, state_after={self.state_after})" + + def __str__(self) -> str: + return self.to_string() + + def __repr__(self) -> str: + return self.to_string() + + def get_annotated_tactic(self) -> Tuple[str, List[Dict[str, Any]]]: + """Return the tactic annotated with premise information. + + Premises in the tactic are marked by `` ... ``. For example, + :code:`rw [add_comm b]` contains a premise :code:`add_comm` and therefore + becomes :code:`rw [add_comm b]`. In addition, the function returns + the provenance (full name, file path, line/column numbers) of all premises. + + Returns: + Tuple[str, List[Dict[str, Any]]]: The first return value is the tactic string marked by `` ... ``. The second return value is a list of provenances. + """ + assert self.traced_theorem != None + lean_file = self.traced_theorem.traced_file.lean_file + annot_tac = [] + provenances = [] + cur = self.start + + def _callback4(node: IdentNode, _): + nonlocal cur + + if ( + node.full_name is not None + and node.mod_name is not None + and node.def_start is not None + and node.def_end is not None + ): + if cur <= node.start: + annot_tac.append(lean_file[cur : node.start]) + annot_tac.append("" + lean_file[node.start : node.end] + "") + prov = {"full_name": node.full_name} + prov["def_path"] = node.def_path + prov["def_pos"] = list(node.def_start) + prov["def_end_pos"] = list(node.def_end) + provenances.append(prov) + cur = node.end + + self.ast.traverse_preorder(_callback4, IdentNode) + annot_tac.append(lean_file[cur : self.end]) + + return "".join(annot_tac), provenances + + +@dataclass(frozen=True) +class TracedTheorem: + """A traced theorem is a theorem with additional information such as the AST.""" + + root_dir: Path = field(repr=False) + """Root directory of the corresponding traced repo. 
+ """ + + theorem: Theorem + """The corresponding :class:`Theorem` object. + """ + + ast: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode] = field( + repr=False, compare=False + ) + """AST of the theorem. + """ + + comments: List[Comment] = field(repr=False, compare=False) + """All comments in the theorem/proof. + """ + + traced_file: Optional["TracedFile"] = field(default=None, repr=False, compare=False) + """The traced file this theorem belongs to. + """ + + def __post_init__(self) -> None: + assert ( + self.root_dir.is_absolute() and self.root_dir == self.traced_file.root_dir + ) + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_file"} + d["traced_file"] = None + return d + + @property + def start(self) -> Pos: + """Start position in :file:`*.lean` file.""" + return self.ast.start + + @property + def end(self) -> Pos: + """End position in :file:`*.lean` file.""" + return self.ast.end + + @property + def repo(self) -> LeanGitRepo: + """The Lean repo this theorem belongs to.""" + return self.theorem.repo + + @property + def file_path(self) -> Path: + """The theorem's file path (relative to the root directory).""" + return self.theorem.file_path + + @property + def traced_repo(self) -> "TracedRepo": + """The traced repo this theorem belongs to.""" + if self.traced_file is None: + return None + else: + return self.traced_file.traced_repo + + @property + def is_private(self) -> bool: + """Check if the theorem is private.""" + return self.ast.is_private() + + def show(self) -> None: + """Show the theorem in the default browser.""" + url = os.path.join( + self.repo.url, + "blob", + self.repo.commit, + self.file_path, + f"#L{self.start.line_nb}-L{self.end.line_nb}", + ) + webbrowser.open(url) + + def has_tactic_proof(self) -> bool: + """Check if the theorem has a tactic-style proof.""" + return self.ast.has_tactic_proof() + + def get_proof_node(self) -> Node: + """Return the AST of the theorem's proof.""" + return self.ast.get_proof_node() + + def locate_proof(self) -> Tuple[Pos, Pos]: + """Return the start/end positions of the proof.""" + start, end = self.get_proof_node().get_closure() + if end < self.end: + end = self.end + return start, end + + def get_tactic_proof(self) -> Optional[str]: + """Return the tactic-style proof (if any).""" + if not self.has_tactic_proof(): + return None + node = self.get_proof_node() + start, end = node.get_closure() + proof = get_code_without_comments(node.lean_file, start, end, self.comments) + if not re.match(r"^(by|begin)\s", proof): + return None + else: + return proof + + def get_theorem_statement(self) -> str: + """Return the theorem statement.""" + proof_start, _ = self.locate_proof() + return get_code_without_comments( + self.traced_file.lean_file, self.ast.start, proof_start, self.comments + ) + + def get_single_tactic_proof(self) -> Optional[str]: + """Wrap the proof into a single (potentially very long) tactic.""" + if not self.has_tactic_proof(): + return None + node = self.get_proof_node() + start, end = node.get_closure() + proof = get_code_without_comments(node.lean_file, start, end, self.comments) + + raise NotImplementedError + assert isinstance(node.children[0], AtomNode) and node.children[0].val == "by" + assert proof.startswith("by") + proof = proof[len("by") :].strip() + + return proof + + def get_premise_full_names(self) -> List[str]: + """Return the fully qualified names of all premises used in the proof.""" + names = [] + + def _callback(node: IdentNode, _: 
List[Node]): + if node.full_name is not None: + names.append(node.full_name) + + self.ast.traverse_preorder(_callback, node_cls=IdentNode) + + return names + + def get_traced_tactics(self, atomic_only: bool = False) -> List[TracedTactic]: + """Return a list of traced tactics in the proof.""" + tacs = self._get_traced_tactics_lean4(atomic_only) + + # Deduplicate. + signatures = set() + tacs_dedup = [] + for t in tacs: + sig = (t.state_before, t.tactic, t.state_after) + if sig not in signatures: + signatures.add(sig) + tacs_dedup.append(t) + + return tacs_dedup + + def _get_traced_tactics_lean4( + self, atomic_only: bool = False + ) -> List[TracedTactic]: + tacs = [] + + def _callback(node, _): + if not isinstance( + node, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + return + for tac_node in node.get_tactic_nodes(atomic_only): + if ( + hasattr(tac_node, "state_before") + and tac_node.state_before is not None + ): + # Tactics outside theorem/lemma definitions are not recorded. + tacs.append(TracedTactic(tac_node, self)) + + self.ast.traverse_preorder(_callback, node_cls=None) + return tacs + + def get_num_tactics(self) -> int: + """Return the number of tactics in the proof.""" + return len(self.get_traced_tactics()) + + +_TAG_INDEX_REGEX = re.compile(r"(?P\S+)\[(?P\d+)\]$") + + +def _qualify_name(name: str, prefix: str) -> str: + """Qualify a name with a prefix.""" + if name.startswith("_root_."): + return name[len("_root_.") :] + elif prefix == "": + return name + else: + return f"{prefix}.{name}" + + +def _fix_indentation(tac: str, indent: int) -> str: + """Fix the indentation of a tactic.""" + lines = tac.splitlines() + if len(lines) == 1: + return tac + else: + lines_new = [lines[0]] + for l in lines[1:]: + for i in range(len(l)): + if l[i] != " " or i >= indent: + lines_new.append(l[i:]) + break + + return "\n".join(lines_new) + + +@dataclass(eq=False) +class TracedFile: + """A traced file is a Lean source file annotated with syntactic/semantic information + such as tactic states, Lean expressions, and abstract syntax trees (ASTs). + """ + + root_dir: Path + """Root directory (in absolute path) of the corresponding traced repo. + """ + + repo: LeanGitRepo + """The Lean repo this traced file belongs to. + """ + + lean_file: LeanFile + """Lean source file of this traced file. + """ + + ast: FileNode = field(repr=False) + """Abstract syntax tree (AST) of the entire :code:`*.lean` file. + + AST nodes are defined in :ref:`lean_dojo.data_extraction.ast`. + """ + + comments: List[Comment] = field(repr=False) + """All comments in the :code:`*.lean` file. + """ + + traced_repo: Optional["TracedRepo"] = field(default=None, repr=False) + """The traced repo this traced file belongs to. + + Note that ``traced_repo`` will become None after the traced file is serialized/deserialized on its own. + """ + + def __post_init__(self) -> None: + assert self.root_dir.is_absolute(), f"{self.root_dir} is not an absolute path" + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_repo"} + d["traced_repo"] = None + return d + + @property + def path(self) -> Path: + """Path of the :file:`*.lean` file relative to the root directory.""" + return self.lean_file.path + + @property + def abs_path(self) -> Path: + """Absolute path of the :code:`*.lean` file.""" + return self.root_dir / self.path + + @property + def has_prelude(self) -> bool: + """Check whether the file starts with :code:``prelude``. 
+ + :code:``prelude`` instructs Lean NOT to include its built-in library automatically. + """ + result = False + + def _callback(node: ModulePreludeNode, _: List[Node]): + nonlocal result + result = True + return True # Stop traversing. + + self.ast.traverse_preorder(_callback, node_cls=ModulePreludeNode) + return result + + @classmethod + def from_traced_file( + cls, root_dir: Union[str, Path], json_path: Path, repo: LeanGitRepo + ) -> "TracedFile": + """Construct a :class:`TracedFile` object by parsing a :file:`*.ast.json` file + produced by :code:`lean --ast --tsast --tspp` (Lean 3) or :file:`ExtractData.lean` (Lean 4). + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + json_path (Path): Path of the :file:`*.ast.json` file relative to ``root_dir``. + """ + root_dir = Path(root_dir) + root_dir = root_dir.resolve() + if not json_path.is_absolute(): + json_path = root_dir / json_path + if not json_path.exists(): + raise FileNotFoundError(f"{json_path} does not exist") + assert json_path.suffixes == [ + ".ast", + ".json", + ], f"{json_path} is not a *.ast.json file" + + return cls._from_lean4_traced_file(root_dir, json_path, repo) + + @classmethod + def _from_lean4_traced_file( + cls, root_dir: Path, json_path: Path, repo: LeanGitRepo + ) -> "TracedFile": + lean_path = to_lean_path(root_dir, json_path, repo) + lean_file = LeanFile(root_dir, lean_path) + + data = json.load(json_path.open()) + + data["module_paths"] = [] + for line in ( + json_path.with_suffix("").with_suffix("").with_suffix(".dep_paths").open() + ): + line = line.strip() + if line == "": + break + data["module_paths"].append(line) + + ast = FileNode.from_data(data, lean_file) + comments = _collect_lean4_comments(ast) + TracedFile._post_process_lean4( + ast, + lean_file, + data["tactics"], + data["premises"], + data["module_paths"], + comments, + ) + + return cls(root_dir, repo, lean_file, ast, comments) + + @classmethod + def _post_process_lean4( + cls, + ast: FileNode, + lean_file: LeanFile, + tactics_data: List[Dict[str, Any]], + premises_data: List[Dict[str, Any]], + imports_data: List[str], + comments: List[Comment], + ) -> None: + pos2tactics = {} + for t in tactics_data: + start = lean_file.convert_pos(t["pos"]) + end = lean_file.convert_pos(t["endPos"]) + pos2tactics[(start, end)] = t + + pos2premises = {} + for p in premises_data: + if ( + p is None + or p["pos"] is None + or p["endPos"] is None + or p["fullName"] is None + or p["fullName"] == "[anonymous]" + ): + continue + start_line_nb, start_column_nb = p["pos"]["line"], p["pos"]["column"] + end_line_nb, end_column_nb = p["endPos"]["line"], p["endPos"]["column"] + start = Pos(line_nb=start_line_nb, column_nb=start_column_nb + 1) + end = Pos(line_nb=end_line_nb, column_nb=end_column_nb + 1) + pos2premises[(start, end)] = p + + inside_sections_namespaces = [] + + def _callback(node: Node, _): + if ( + isinstance( + node, + ( + CommandNamespaceNode, + CommandSectionNode, + CommandNoncomputablesectionNode, + ), + ) + and node.name is not None + ): + inside_sections_namespaces.append(node) + elif ( + isinstance(node, CommandEndNode) + and node.name is not None + and len(inside_sections_namespaces) > 0 + ): + inside_sections_namespaces.pop() + elif is_potential_premise_lean4(node): + prefix = ".".join( + ns.name + for ns in inside_sections_namespaces + if isinstance(ns, CommandNamespaceNode) + ) + full_name = ( + [_qualify_name(name, prefix) for name in node.name] + if is_mutual_lean4(node) + else _qualify_name(node.name, prefix) + ) + 
object.__setattr__(node, "full_name", full_name) + if isinstance(node, CommandDeclarationNode) and node.is_theorem: + object.__setattr__(node.get_theorem_node(), "full_name", full_name) + elif isinstance( + node, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + for tac_node in node.get_tactic_nodes(): + assert isinstance( + tac_node, (OtherNode, TacticTacticseqbracketedNode) + ) + if (tac_node.start, tac_node.end) not in pos2tactics: + continue + t = pos2tactics[(tac_node.start, tac_node.end)] + tac = get_code_without_comments( + lean_file, tac_node.start, tac_node.end, comments + ) + tac = _fix_indentation(tac, tac_node.start.column_nb - 1) + object.__setattr__(tac_node, "state_before", t["stateBefore"]) + object.__setattr__(tac_node, "state_after", t["stateAfter"]) + object.__setattr__(tac_node, "tactic", tac) + elif isinstance(node, IdentNode): + start, end = node.get_closure() + if (start, end) in pos2premises: + assert start is not None + assert end is not None + p = pos2premises[(start, end)] + prem = get_code_without_comments(lean_file, start, end, comments) + prem = _fix_indentation(prem, start.column_nb - 1) + if p["fullName"] is not None: + object.__setattr__(node, "full_name", p["fullName"]) + if p["modName"] is not None: + object.__setattr__(node, "mod_name", p["modName"]) + if p["defPath"] is not None: + object.__setattr__(node, "def_path", p["defPath"]) + if p["defPos"] is not None and p["defEndPos"] is not None: + def_start_line_nb, def_start_column_nb = ( + p["defPos"]["line"], + p["defPos"]["column"], + ) + def_end_line_nb, def_end_column_nb = ( + p["defEndPos"]["line"], + p["defEndPos"]["column"], + ) + def_start = Pos( + line_nb=def_start_line_nb, column_nb=def_start_column_nb + 1 + ) + def_end = Pos( + line_nb=def_end_line_nb, column_nb=def_end_column_nb + 1 + ) + object.__setattr__(node, "def_start", def_start) + object.__setattr__(node, "def_end", def_end) + elif isinstance(node, ModuleImportNode): + node_module_name = object.__getattribute__(node, "module") + if node_module_name is not None: + suffix = node_module_name.replace(".", "/") + for import_line in imports_data: + if import_line.endswith( + suffix + ".lean" + ) or import_line.endswith(suffix + "/default.lean"): + object.__setattr__(node, "path", Path(import_line)) + + ast.traverse_preorder(_callback, node_cls=None) + + def check_sanity(self) -> None: + """Perform some basic sanity checks. + + The function raises exceptions in case of unsuccessful checks. + """ + assert isinstance(self.root_dir, Path) + assert isinstance(self.lean_file, LeanFile) + isinstance(self.ast, FileNode) + + assert self.lean_file.root_dir == self.root_dir + + for t in self.get_traced_theorems(): + assert str(self.lean_file.path).endswith(str(t.theorem.file_path)) + assert t.traced_file is None or t.traced_file is self + + def traverse_preorder(self, callback, node_cls: Optional[type] = None): + """Traverse the AST in preorder. + + Args: + callback (function): Callback function for visiting AST nodes. + node_cls (Optional[type], optional): Restrict the application of + ``callback`` to only nodes of type ``node_cls``. + Defaults to None, which means applying ``callback`` to all. + """ + self.ast.traverse_preorder(callback, node_cls) + + def _get_repo_and_relative_path(self) -> Tuple[LeanGitRepo, Path]: + """Return the repo this file belongs to, as well as the file's path relative to it.""" + if self.path.is_relative_to(LEAN4_PACKAGES_DIR): + # The theorem belongs to one of the dependencies. 
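+            # E.g., "lake-packages/std/Std.lean" (illustrative path) belongs to the "std"
+            # dependency, and its path relative to that dependency is "Std.lean".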
+ p = self.path.relative_to(LEAN4_PACKAGES_DIR) + name = p.parts[0] + repo = self.traced_repo.dependencies[name] + return repo, p.relative_to(name) + else: + # The theorem belongs to the traced repo itself. + return self.traced_repo.repo, self.path + + def get_traced_theorem( + self, thm_or_name: Union[Theorem, str] + ) -> Optional[TracedTheorem]: + """Return a :class:`TracedTheorem` object given an :class:`Theorem` object + or its fully-qualified name.""" + if isinstance(thm_or_name, Theorem): + thm = thm_or_name + else: + repo, path = self._get_repo_and_relative_path() + thm = Theorem(repo, path, thm_or_name) + result = None + private_result = None + + def _callback( + node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ + ) -> None: + nonlocal result, private_result + if not isinstance( + node, + ( + CommandTheoremNode, + LemmaNode, + MathlibTacticLemmaNode, + ), + ): + return False + if node.full_name == thm.full_name: + comments = self._filter_comments(node.start, node.end) + t = TracedTheorem(self.root_dir, thm, node, comments, self) + if t.is_private: + private_result = t + else: + result = t + + self.ast.traverse_preorder(_callback, node_cls=None) + + # Prioritize non-private theorems. + if result is None: + result = private_result + return result + + def get_traced_theorems(self) -> List[TracedTheorem]: + """Return a list of traced theorem in this traced file.""" + traced_theorems = [] + + def _callback( + node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ + ) -> None: + if not isinstance( + node, + ( + CommandTheoremNode, + LemmaNode, + MathlibTacticLemmaNode, + ), + ): + return False + repo, path = self._get_repo_and_relative_path() + thm = Theorem(repo, path, node.full_name) + comments = self._filter_comments(node.start, node.end) + traced_theorems.append( + TracedTheorem(self.root_dir, thm, node, comments, self) + ) + # No need to traverse the subtree since theorems cannot be nested. + return True + + self.traverse_preorder(_callback, node_cls=None) + return traced_theorems + + def _filter_comments(self, start: Pos, end: Pos) -> List[Comment]: + """Return a list of comments that are contained in the given range.""" + comments = [] + for c in self.comments: + if c.start < start: + assert c.end <= start + elif c.start < end: + assert c.end <= end + comments.append(c) + return comments + + def get_direct_dependencies(self, repo: LeanGitRepo) -> List[Tuple[str, Path]]: + """Return the names and paths of all modules imported by the current :file:`*.lean` file.""" + deps = set() + + if not self.has_prelude: # Add the prelude as a dependency. + init_lean = Path("src/lean/Init.lean") + if self.root_dir.name == "lean4": + deps.add(("Init", init_lean)) + else: + deps.add(("Init", LEAN4_PACKAGES_DIR / "lean4" / init_lean)) + + def _callback(node: ModuleImportNode, _) -> None: + if node.module is not None and node.path is not None: + deps.add((node.module, node.path)) + + self.traverse_preorder(_callback, node_cls=ModuleImportNode) + return list(deps) + + def get_premise_definitions(self) -> List[Dict[str, Any]]: + """Return all theorems and definitions defined in the current file that + can be potentially used as premises. 
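+
+        Each entry is a dict with the keys ``full_name``, ``code``, ``start``, ``end``, and
+        ``kind``. A made-up example for illustration::
+
+            {"full_name": "Nat.my_lemma", "code": "theorem my_lemma : 1 + 1 = 2",
+             "start": [10, 1], "end": [12, 30], "kind": "theorem"}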
+ + Returns: + List[Dict[str, Any]]: _description_ + """ + results = [] + + def _callback4(node: Node, _) -> None: + if is_potential_premise_lean4(node): + start, end = node.get_closure() + if isinstance(node, CommandDeclarationNode) and node.is_theorem: + # We assume theorems are defined using keywords "theorem" + # or "lemma" but not, e.g., "def". + proof_start, _ = ( + node.get_theorem_node().get_proof_node().get_closure() + ) + code = get_code_without_comments( + self.lean_file, start, proof_start, self.comments + ) + if code.endswith(":="): + code = code[:-2].strip() + else: + code = get_code_without_comments( + self.lean_file, start, end, self.comments + ) + # TODO: For alias, restate_axiom, etc., the code is not very informative. + if is_mutual_lean4(node): + for s in node.full_name: + results.append( + { + "full_name": s, + "code": code, + "start": list(start), + "end": list(end), + "kind": node.kind(), + } + ) + else: + results.append( + { + "full_name": node.full_name, + "code": code, + "start": list(start), + "end": list(end), + "kind": node.kind(), + } + ) + + self.traverse_preorder(_callback4, node_cls=None) + return results + + def to_xml(self) -> str: + """Serialize a :class:`TracedFile` object to XML.""" + tree = etree.Element(self.__class__.__name__) + + tree.set("path", str(self.path)) + tree.set("md5", compute_md5(self.abs_path)) + + self.ast.to_xml(tree) + + if self.comments is not None: + comments_node = etree.SubElement(tree, "Comments") + for c in self.comments: + c.to_xml(comments_node) + + return etree.tostring(tree, encoding="utf-8", pretty_print=True).decode() + + @classmethod + def from_xml( + cls, + root_dir: Union[str, Path], + path: Union[str, Path], + repo: LeanGitRepo, + ) -> "TracedFile": + """Load a :class:`TracedFile` object from its :file:`*.trace.xml` file. + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + path (Union[str, Path]): Path of the :file:`*.trace.xml` file relative to ``root_dir``. + repo (LeanGitRepo): The repo to which the traced file belongs. 
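+
+        A minimal usage sketch (the path below is a placeholder)::
+
+            tf = TracedFile.from_xml(root_dir, "build/ir/Foo/Bar.trace.xml", repo)
+            tf.check_sanity()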
+ """ + root_dir = Path(root_dir) + path = Path(path) + assert path.suffixes == [".trace", ".xml"] + lean_path = to_lean_path(root_dir, path, repo) + lean_file = LeanFile(root_dir, lean_path) + + tree = etree.parse(path).getroot() + assert tree.tag == "TracedFile" + assert tree.attrib["path"] == str(lean_path) + assert tree.attrib["md5"] == compute_md5(lean_file.abs_path) + + ast_tree, comments_tree = list(tree) + ast = FileNode.from_xml(ast_tree, lean_file) + comments = [Comment.from_xml(c) for c in comments_tree] + + return cls(root_dir, repo, lean_file, ast, comments) + + +def _save_xml_to_disk(tf: TracedFile) -> None: + xml_path = tf.root_dir / to_xml_path(tf.root_dir, tf.path, tf.repo) + with xml_path.open("wt") as oup: + oup.write(tf.to_xml()) + + +def _build_dependency_graph( + seed_files: List[TracedFile], root_dir: Path, repo: LeanGitRepo +) -> nx.DiGraph: + G = nx.DiGraph() + + for tf in seed_files: + tf_path_str = str(tf.path) + assert not G.has_node(tf_path_str) + G.add_node(tf_path_str, traced_file=tf) + + traced_files = seed_files.copy() + i = 0 + + while i < len(traced_files): + tf = traced_files[i] + tf_path_str = str(tf.path) + + for dep_module, dep_path in tf.get_direct_dependencies(repo): + dep_path_str = str(dep_path) + if not G.has_node(dep_path_str): + json_path = to_json_path(root_dir, dep_path, repo) + tf_dep = TracedFile.from_traced_file(root_dir, json_path, repo) + G.add_node(dep_path_str, traced_file=tf_dep) + traced_files.append(tf_dep) + + G.add_edge(tf_path_str, dep_path_str, module=dep_module) + + i += 1 + + assert nx.is_directed_acyclic_graph(G) + return G + + +@ray.remote +class _TracedRepoHelper: + """ + Helper class serving as Ray actor. + """ + + def __init__(self, root_dir: Path, repo: LeanGitRepo) -> None: + self.root_dir = root_dir + self.repo = repo + + def parse_traced_file(self, path: Path) -> TracedFile: + return TracedFile.from_traced_file(self.root_dir, path, self.repo) + + def save_xml_to_disk(self, tf: TracedFile) -> None: + return _save_xml_to_disk(tf) + + def load_xml_from_disk(self, path: Path) -> TracedFile: + return TracedFile.from_xml(self.root_dir, path, self.repo) + + +@dataclass(frozen=True, eq=False) +class TracedRepo: + """A traced repo is a Lean repo of traced files and additional information, such as + other repos it depends on, as well as the dependency graph between files. + """ + + repo: LeanGitRepo + """The corresponding Lean repo. + """ + + dependencies: Dict[str, LeanGitRepo] + """Dictionary mapping the name of each dependency to a :class:`LeanGitRepo` object. + """ + + root_dir: Path + """Root directory of the traced repo. + """ + + traced_files: List[TracedFile] = field(repr=False) + """List of traced files in the repo.""" + + traced_files_graph: Optional[nx.DiGraph] = field(repr=False) + """Dependency graph between files in the repo. + + The graph is a DAG, and there is an edge from file :file:`X` to file :file:`Y` + if and only if :file:`X` imports :file:`Y` + """ + + def __post_init__(self) -> None: + assert self.root_dir.is_absolute() + + def __setstate__(self, state) -> None: + object.__setattr__(self, "__dict__", state) + self._update_traced_files() + + @property + def name(self) -> str: + """Name of the repo.""" + return self.repo.name + + def show(self) -> None: + """Show the repo in the default browser.""" + self.repo.show() + + def check_sanity(self) -> None: + """Perform some basic sanity checks. + + The function raises exceptions in case of unsuccessful checks. 
+ """ + logger.debug(f"Checking the sanity of {self}") + assert isinstance(self.repo, LeanGitRepo) + assert isinstance(self.dependencies, dict) + for k, v in self.dependencies.items(): + assert isinstance(k, str) and isinstance(v, LeanGitRepo) + assert isinstance(self.root_dir, Path) + assert self.traced_files_graph is None or isinstance( + self.traced_files_graph, nx.DiGraph + ) + + assert self.repo not in self.dependencies.values() + + json_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.ast.json") + } + lean_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.lean") + } + xml_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.trace.xml") + } + path_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.dep_paths") + } + + if self.traced_files_graph is not None: + if not LOAD_USED_PACKAGES_ONLY: + assert len(json_files) == self.traced_files_graph.number_of_nodes() + + for path_str, tf_node in self.traced_files_graph.nodes.items(): + tf = tf_node["traced_file"] + path = Path(path_str) + tf.check_sanity() + assert tf.path == path and tf.root_dir == self.root_dir + assert tf.traced_repo is None or tf.traced_repo is self + assert path in lean_files + assert ( + to_dep_path(self.root_dir, path, self.repo) in path_files + ), to_dep_path(self.root_dir, path, self.repo) + assert ( + to_json_path(self.root_dir, path, self.repo) in json_files + ), to_json_path(self.root_dir, path, self.repo) + if len(xml_files) > 0: + assert ( + to_xml_path(self.root_dir, path, self.repo) in xml_files + ), to_xml_path(self.root_dir, path, self.repo) + + @classmethod + def from_traced_files( + cls, root_dir: Union[str, Path], build_deps: bool = True + ) -> "TracedRepo": + """Construct a :class:`TracedRepo` object by parsing :file:`*.ast.json` and :file:`*.path` files + produced by :code:`lean --ast --tsast --tspp` (Lean 3) or :file:`ExtractData.lean` (Lean 4). + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + build_deps (bool, optional): Whether to build the dependency graph between files. 
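+
+        A minimal usage sketch (the directory is a placeholder)::
+
+            traced_repo = TracedRepo.from_traced_files("/tmp/traced/my-lean-repo")
+            traced_repo.check_sanity()
+            traced_repo.save_to_disk()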
+ """ + root_dir = Path(root_dir).resolve() + if not is_git_repo(root_dir): + raise RuntimeError(f"{root_dir} is not a Git repo.") + repo = LeanGitRepo.from_path(root_dir) + + json_paths = list(root_dir.glob("**/*.ast.json")) + random.shuffle(json_paths) + logger.debug( + f"Parsing {len(json_paths)} *.ast.json files in {root_dir} with {NUM_WORKERS} workers" + ) + + if NUM_WORKERS <= 1: + traced_files = [ + TracedFile.from_traced_file(root_dir, path, repo) + for path in tqdm(json_paths) + ] + else: + with ray_actor_pool(_TracedRepoHelper, root_dir, repo) as pool: + traced_files = list( + tqdm( + pool.map_unordered( + lambda a, p: a.parse_traced_file.remote(p), json_paths + ), + total=len(json_paths), + ) + ) + + dependencies = repo.get_dependencies(root_dir) + if build_deps: + traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) + else: + traced_files_graph = None + + traced_repo = cls( + repo, dependencies, root_dir, traced_files, traced_files_graph + ) + traced_repo._update_traced_files() + return traced_repo + + def get_traced_file(self, path: Union[str, Path]) -> TracedFile: + """Return a traced file by its path.""" + return self.traced_files_graph.nodes[str(path)]["traced_file"] + + def _update_traced_files(self) -> None: + for tf in self.traced_files: + tf.traced_repo = self + + def save_to_disk(self) -> None: + """Save all traced files in the repo to the disk as :file:`*.trace.xml` files.""" + num_traced_files = len(self.traced_files) + logger.debug( + f"Saving {num_traced_files} traced XML files to {self.root_dir} with {NUM_WORKERS} workers" + ) + if NUM_WORKERS <= 1: + for tf in tqdm(self.traced_files, total=num_traced_files): + _save_xml_to_disk(tf) + else: + with ray_actor_pool(_TracedRepoHelper, self.root_dir, self.repo) as pool: + list( + tqdm( + pool.map_unordered( + lambda a, tf: a.save_xml_to_disk.remote(tf), + self.traced_files, + ), + total=num_traced_files, + ) + ) + + @classmethod + def load_from_disk( + cls, root_dir: Union[str, Path], build_deps: bool = True + ) -> "TracedRepo": + """Load a traced repo from :file:`*.trace.xml` files.""" + root_dir = Path(root_dir).resolve() + if not is_git_repo(root_dir): + raise RuntimeError(f"{root_dir} is not a Git repo.") + repo = LeanGitRepo.from_path(root_dir) + + xml_paths = list(root_dir.glob("**/*.trace.xml")) + logger.debug( + f"Loading {len(xml_paths)} traced XML files from {root_dir} with {NUM_WORKERS} workers" + ) + + # Start from files in the target repo as seeds. + # Only load dependency files that are actually used. 
+ if LOAD_USED_PACKAGES_ONLY: + xml_paths = [ + p + for p in xml_paths + if not "lake-packages/" in str(p) and not ".lake/packages" in str(p) + ] + + if NUM_WORKERS <= 1: + traced_files = [ + TracedFile.from_xml(root_dir, path, repo) for path in tqdm(xml_paths) + ] + else: + with ray_actor_pool(_TracedRepoHelper, root_dir, repo) as pool: + traced_files = list( + tqdm( + pool.map_unordered( + lambda a, path: a.load_xml_from_disk.remote(path), xml_paths + ), + total=len(xml_paths), + ) + ) + + dependencies = repo.get_dependencies(root_dir) + if build_deps: + traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) + else: + traced_files_graph = None + + traced_repo = cls( + repo, dependencies, root_dir, traced_files, traced_files_graph + ) + traced_repo._update_traced_files() + return traced_repo + + def get_traced_theorems(self) -> List[TracedTheorem]: + """Return all traced theorems in the repo.""" + return list( + itertools.chain.from_iterable( + tf.get_traced_theorems() for tf in self.traced_files + ) + ) + + def get_traced_theorem(self, thm: Theorem) -> Optional[TracedTheorem]: + """Return a :class:`TracedTheorem` object corresponding to ``thm``""" + if thm.repo == self.repo: + path = Path(thm.repo.name) / thm.file_path + else: + assert thm.repo in self.dependencies.values() + path = Path(self.name) / LEAN4_PACKAGES_DIR / thm.repo.name / thm.file_path + return self.get_traced_file(path).get_traced_theorem(thm.full_name) diff --git a/interaction/Lean4Repl.lean b/interaction/Lean4Repl.lean new file mode 100644 index 0000000..dad9b66 --- /dev/null +++ b/interaction/Lean4Repl.lean @@ -0,0 +1,357 @@ +-- REPL for interacting with Lean 4 via the command line. +import Lean.Message +import Lean.Elab.Tactic +import Lean.Elab.Frontend + +open Lean Lean.Meta Lean.Elab Lean.Elab.Command Lean.Elab.Tactic + +namespace LeanDojo + + +/-- Print the response as JSON. --/ +private def printResponse {α : Type _} [ToJson α] (res : α) : IO Unit := do + let json := (toJson res).pretty 99999999999999999 + println! "REPL> {json}" + (← IO.getStdout).flush + + +/-- Join a list of strings using a separator. --/ +private def join (l : List String) (sep : String := "\n") : String := + match l with + | [] => "" + | first :: others => others.foldl (fun r s => r ++ sep ++ s) first + + +/-- A request to REPL. --/ +structure Request where + /-- Tactic/command state ID on which to execute the request. -/ + sid: Nat + /-- Tactic/command. --/ + cmd: String +deriving FromJson, ToJson + + +/-- A response to REPL. --/ +structure Response where + /-- New tactic/command state ID. --/ + sid : Option Nat := none + /-- Next tactic state. --/ + tacticState : Option String := none + /-- Error message. --/ + error: Option String := none +deriving ToJson + + +/-- The state of the REPL. --/ +structure ReplState (σ : Type _) where + /-- Saved tactic/command states. --/ + savedStates : Array σ + /-- The first solved tactic state. --/ + solvedState : Option σ + + +/-- Get the saved tactic state with the given ID. --/ +private def getSavedState? (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] (sid : Nat) : m (Option σ) := do + return (← get).savedStates[sid]? + + +/-- Get the initial tactic state. --/ +private def getInitialState! (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] [MonadError m] : m σ := do + let some ts ← getSavedState? m 0 | throwError "[fatal] no initial state" + return ts + + +/-- Get the next state ID. 
--/ +private def getNextSid (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] : m Nat := do + return (← get).savedStates.size + + +namespace TacticRepl + + +/-- The tactic REPL monad. --/ +abbrev TacticReplM := StateT (ReplState Tactic.SavedState) TacticM + + +instance : MonadLift IO TacticReplM where + monadLift x := liftM x + + +/-- Insert a tactic state into the REPL state. --/ +private def insertTacticState (ts : Tactic.SavedState) : TacticReplM Unit := do + let succeeded := ts.tactic.goals.isEmpty + modifyGet fun s => ((), ⟨s.savedStates.push ts, + match s.solvedState with + | some _ => s.solvedState + | none => if succeeded then ts else none + ⟩) + + +/-- Pretty print the given tactic state. --/ +def ppTacticState (ts : Tactic.SavedState) : TacticM String := do + match ts.tactic.goals with + | [] => return "no goals" + | [g] => return (← Meta.ppGoal g).pretty + | goals => + return (← goals.foldlM (fun a b => do return a ++ "\n\n" ++ (← Meta.ppGoal b).pretty) "").trim + + +/-- Initialize the REPL. --/ +private def initializeTacticRepl : TacticM Tactic.SavedState := do + if not (← isProp (← getMainTarget)) then + throwError "[fatal] not_a_theorem" + pruneSolvedGoals + let ts ← Tactic.saveState + let ts_str ← ppTacticState ts + let res : Response := {sid := some 0, tacticState := ts_str} + printResponse res + return ts + + +private def levels2Names : List Level → NameSet + | [] => NameSet.empty + | Level.param n :: us => (levels2Names us).insert n + | _ :: us => levels2Names us + + +private def collectFromLevel : Level → NameSet +| Level.zero => NameSet.empty +| Level.succ l => collectFromLevel l +| Level.param n => NameSet.empty.insert n +| Level.max l1 l2 => (collectFromLevel l1).union $ collectFromLevel l2 +| Level.imax l1 l2 => (collectFromLevel l1).union $ collectFromLevel l2 +| Level.mvar _ => NameSet.empty + + +private def collectLevelParams : Expr → NameSet + | .sort u => collectFromLevel u + | .const _ us => levels2Names us + | .app fm arg => (collectLevelParams fm).union $ collectLevelParams arg + | .lam _ binderType body _ => (collectLevelParams binderType).union $ collectLevelParams body + | .forallE _ binderType body _ => (collectLevelParams binderType).union $ collectLevelParams body + | .letE _ type value body _ => ((collectLevelParams type).union $ collectLevelParams value).union $ collectLevelParams body + | .mdata _ expr => collectLevelParams expr + | .proj _ _ struct => collectLevelParams struct + | _ => NameSet.empty + + +private def collectFVarsAux : Expr → NameSet + | .fvar fvarId => NameSet.empty.insert fvarId.name + | .app fm arg => (collectFVarsAux fm).union $ collectFVarsAux arg + | .lam _ binderType body _ => (collectFVarsAux binderType).union $ collectFVarsAux body + | .forallE _ binderType body _ => (collectFVarsAux binderType).union $ collectFVarsAux body + | .letE _ type value body _ => ((collectFVarsAux type).union $ collectFVarsAux value).union $ collectFVarsAux body + | .mdata _ expr => collectFVarsAux expr + | .proj _ _ struct => collectFVarsAux struct + | _ => NameSet.empty + + +private def collectFVars (e : Expr) : MetaM (Array Expr) := do + let names := collectFVarsAux e + let mut fvars := #[] + for ldecl in ← getLCtx do + if ldecl.isImplementationDetail then + continue + if names.contains ldecl.fvarId.name then + fvars := fvars.push $ .fvar ldecl.fvarId + return fvars + + +private def abstractAllLambdaFVars (e : Expr) : MetaM Expr := do + let mut e' := e + while e'.hasFVar do + let fvars ← collectFVars e' + if fvars.isEmpty then 
+ break + e' ← mkLambdaFVars fvars e' + return e' + + +private def validateProof : TacticReplM Response := do + let ts ← Tactic.saveState + + -- Go to the initial state and grab the goal's metavariable ID. + let ts0 ← getInitialState! TacticReplM + ts0.restore + let [goalId] ← getGoals | throwError "[fatal] more than one initial goal" + let tgt ← getMainTarget >>= instantiateMVars + let tgt_fmt ← ppExpr tgt + + -- Check its assigned Expr in the current state. + ts.restore + let some pf ← getExprMVarAssignment? goalId | throwError "[fatal] goal not assigned" + let pf ← instantiateMVars pf + let pft ← inferType pf >>= instantiateMVars + let pft_fmt ← ppExpr pft + + if ! (← withTransparency .all (isExprDefEq tgt pft)) then + return {error := s!"proof type mismatch: {tgt_fmt} != {pft_fmt}"} + + ts0.restore + let pf ← goalId.withContext $ abstractAllLambdaFVars pf + let pft ← inferType pf >>= instantiateMVars + + ts.restore + if pf.hasSorry then + return {error := "proof contains `sorry`"} + + if pf.hasExprMVar then + return {error := "proof contains metavariables"} + + -- Kernel type check. + let lvls := (collectLevelParams pf).toList + let decl := Declaration.thmDecl { + name := Name.anonymous, type := pft, value := pf + levelParams := lvls + } + try + let _ ← addDecl decl + catch ex => + return {error := s!"kernel type check failed: {← ex.toMessageData.toString}"} + + let ts_str ← ppTacticState ts + let next_tsid ← getNextSid TacticReplM + insertTacticState ts + return {sid := next_tsid, tacticState := ts_str} + + +private def handleRunTac (req : Request) : TacticReplM Response := do + match ← getSavedState? TacticReplM req.sid with + | none => throwError s!"[fatal] unknown tsid: {req.sid}" + | some ts => + match Parser.runParserCategory (← getEnv) `tactic req.cmd "" with + | .error err => return {error := err} + | .ok stx => + ts.restore + + try + monadLift $ commitIfNoEx (evalTactic stx) + let s ← getThe Core.State + if s.messages.hasErrors then + let messages := s.messages.toList.filter fun m => m.severity == MessageSeverity.error + return { error := join $ ← (messages.map (·.data)).mapM fun md => md.toString } + catch ex => + return {error := ← ex.toMessageData.toString} + + pruneSolvedGoals + if (← getGoals).isEmpty then + validateProof + else + let ts' ← Tactic.saveState + let ts'_str ← ppTacticState ts' + let next_tsid ← getNextSid TacticReplM + insertTacticState ts' + return {sid := next_tsid, tacticState := ts'_str} + + +end TacticRepl + + +private def loop (m : Type → Type) [Monad m] [MonadLift IO m] [MonadError m] (handler : Request → m Response) : m Unit := do + while true do + let line ← (← IO.getStdin).getLine + if line.trim == "exit" then + break + match (Json.parse line) with + | .error err => throwError s!"[fatal] failed to parse JSON {err}" + | .ok cmd => + match (fromJson? cmd : Except String Request) with + | .error err => throwError s!"[fatal] parse_failed: data={err}" + | .ok req => (← handler req) |> printResponse + + +namespace TacticRepl + +/-- +{"sid": 0, "cmd": "skip"} +{"sid": 1, "cmd": "rw [add_assoc, add_comm b, ←add_assoc]"} +exit +--/ +def repl : TacticM Unit := do + withMainContext do + -- Print the initial goal. + let ts ← initializeTacticRepl + -- Interaction through the command line. + let loop := LeanDojo.loop TacticReplM handleRunTac + let (_, s) ← loop.run {savedStates := #[ts], solvedState := none} + -- Close the proof if we have found a solved tactic state. 
+ match s.solvedState with + | none => return () + | some ts' => ts'.restore + IO.Process.exit 0 + + +end TacticRepl + + +namespace CommandRepl + + +/-- The REPL monad. --/ +abbrev CommandReplM := StateT (ReplState Command.State) CommandElabM + + +instance : MonadLift IO CommandReplM where + monadLift x := liftM x + + +/-- Insert a command state into the REPL state. --/ +private def insertCommandState (cs : Command.State) : CommandReplM Unit := do + modifyGet fun s => ((), ⟨s.savedStates.push cs, none⟩) + + +/-- Initialize the REPL. --/ +private def initializeRepl : CommandElabM Command.State := do + let res : Response := {sid := some 0} + printResponse res + return (← get) + + +private def handleRunCmd (req : Request) : CommandReplM Response := do + match ← getSavedState? CommandReplM req.sid with + | none => throwError s!"[fatal] unknown csid: {req.sid}" + | some cs => + let inputCtx := Parser.mkInputContext req.cmd "" + let parserState := { : Parser.ModuleParserState } + let cs' := (← IO.processCommands inputCtx parserState cs).commandState + + -- Collect error messages and print other messages. + let messages := cs'.messages.toList + let mut errors := #[] + for msg in messages do + let s ← msg.data.toString + if msg.severity == MessageSeverity.error then + errors := errors.push s + else + println! s.trim + let err_msg := if errors.isEmpty then none else some (join errors.toList) + + let next_csid ← getNextSid CommandReplM + insertCommandState cs' + return {sid := next_csid, error := err_msg} + + +/-- +{"sid": 0, "cmd": "#eval 1"} +{"sid": 1, "cmd": "#eval x"} +{"sid": 0, "cmd": "def x := 1"} +{"sid": 3, "cmd": "#eval x"} +exit +--/ +def repl : CommandElabM Unit := do + let cs ← initializeRepl + let loop := LeanDojo.loop CommandReplM handleRunCmd + let _ ← loop.run {savedStates := #[cs], solvedState := none} + IO.Process.exit 0 + +end CommandRepl + +end LeanDojo + + +/-- The `lean_dojo_repl` tactic. --/ +elab "lean_dojo_repl" : tactic => LeanDojo.TacticRepl.repl + + +/-- The `#lean_dojo_repl` command. 
--/ +elab "#lean_dojo_repl" : command => LeanDojo.CommandRepl.repl diff --git a/interaction/dojo.py b/interaction/dojo.py new file mode 100644 index 0000000..e13c576 --- /dev/null +++ b/interaction/dojo.py @@ -0,0 +1,549 @@ +import re +import os +import sys +import json +import time +import signal +import shutil +import psutil +from pathlib import Path +from loguru import logger +from tempfile import mkdtemp +from shutil import ignore_patterns +from subprocess import TimeoutExpired +from dataclasses import dataclass, field +from typing import Union, Tuple, List, Dict, Any, Optional + +from ..constants import ( + TMP_DIR, + TACTIC_CPU_LIMIT, + TACTIC_MEMORY_LIMIT, +) +from ..utils import to_json_path +from .parse_goals import parse_goals, Goal +from ..data_extraction.trace import get_traced_repo_path +from ..data_extraction.lean import Theorem, LeanGitRepo, Pos +from ..container import get_container, Mount, NativeContainer, DockerContainer +from ..data_extraction.traced_data import TracedFile, get_code_without_comments + + +_REPL_PROMPT = "REPL>" + + +@dataclass(frozen=True) +class CommandState: + id: int = field(compare=False) + message: Optional[str] = field(default=None, compare=False) + + +@dataclass(frozen=True) +class TacticState: + pp: str + id: int = field(compare=False) + message: Optional[str] = field(default=None, compare=False) + goals: List[Goal] = field(init=False, compare=False, repr=False) + + def __post_init__(self) -> None: + goals = parse_goals(self.pp) + assert len(goals) == self.pp.count("⊢") + object.__setattr__(self, "goals", goals) + + @property + def num_goals(self) -> int: + return len(self.goals) + + +@dataclass(frozen=True) +class ProofFinished: + tactic_state_id: int + message: Optional[str] = field(default=None, compare=False) + + +@dataclass(frozen=True) +class ProofGivenUp: + pass + + +@dataclass(frozen=True) +class LeanError: + error: str + + +@dataclass(frozen=True) +class TimeoutError: + error: str + + +TacticResult = Union[ + TacticState, + ProofFinished, + LeanError, + TimeoutError, + ProofGivenUp, +] + +CommandResult = Union[CommandState, LeanError, TimeoutError] + +State = Union[CommandState, TacticState] + + +class DojoCrashError(Exception): + @property + def is_out_of_memory(self) -> bool: + return str(self) == "OOM" + + +class DojoHardTimeoutError(Exception): + pass + + +class DojoInitError(Exception): + pass + + +def _kill_descendants(proc: psutil.Process) -> None: + for child in proc.children(): + _kill_descendants(child) + proc.kill() + + +class Dojo: + """Gym-like environment for programmatic interaction with Lean through tactics or commands.""" + + entry: Union[Theorem, Tuple[LeanGitRepo, Path, int]] + hard_timeout: Optional[float] + additional_imports: List[str] + repo: LeanGitRepo + file_path: Path + is_successful: Optional[bool] = None + is_crashed: bool = False + has_timedout: bool = False + + def __init__( + self, + entry: Union[Theorem, Tuple[LeanGitRepo, Path, int]], + hard_timeout: Optional[float] = None, + additional_imports: List[str] = [], + ): + """Initialize Dojo. + + Args: + entry (Union[Theorem, Tuple[LeanGitRepo, Path, int]]): When a Theorem is given, + the :class:`Dojo` object enables interaction with the theorem through tactics. + When a tuple of (repo, file_path, line_nb) is given (only supported in Lean 4), + the :class:`Dojo` object enables interaction with Lean through commands (similar to a REPL). + hard_timeout (Optional[float], optional): Hard timeout in seconds. Defaults to None. 
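+            additional_imports (List[str], optional): Extra modules to import in the
+                modified Lean file before interaction starts. Defaults to [].
+
+        A minimal tactic-mode sketch (the repo URL, file path, and theorem name below are
+        placeholders)::
+
+            repo = LeanGitRepo("https://github.com/yangky11/lean4-example", "main")
+            theorem = Theorem(repo, Path("Lean4Example.lean"), "hello_world")
+            with Dojo(theorem, hard_timeout=600) as (dojo, state):
+                result = dojo.run_tac(state, "rw [add_assoc, add_comm b, ←add_assoc]")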
+ """ + self.entry = entry + self.hard_timeout = hard_timeout + self.additional_imports = additional_imports + + if self.uses_tactics: + assert isinstance(entry, Theorem) + self.repo, self.file_path = entry.repo, entry.file_path + self.is_successful = False + else: + assert self.uses_commands + assert isinstance(entry, tuple) + self.repo, self.file_path, _ = entry + self.file_path = Path(self.file_path) + + if self.hard_timeout is None: + logger.warning("Using Lean 4 without a hard timeout may hang indefinitely.") + + @property + def uses_tactics(self) -> bool: + return isinstance(self.entry, Theorem) + + @property + def uses_commands(self) -> bool: + return isinstance(self.entry, tuple) + + def __enter__(self) -> Tuple["Dojo", State]: + """Initialize Dojo.""" + logger.debug(f"Initializing Dojo for {self.entry}") + + # Work in a temporary directory. + self.origin_dir = Path.cwd() + self.tmp_dir = Path(mkdtemp(dir=TMP_DIR)) + + try: + self._install_handlers() + os.chdir(self.tmp_dir) + + # Copy and `cd` into the repo. + traced_repo_path = get_traced_repo_path(self.repo) + shutil.copytree( + traced_repo_path, + self.repo.name, + ignore=ignore_patterns("*.dep_paths", "*.ast.json", "*.trace.xml"), + ) + os.chdir(self.repo.name) + + # Replace the human-written proof with a `repl` tactic. + try: + traced_file = self._locate_traced_file(traced_repo_path) + except FileNotFoundError: + raise DojoInitError( + f"Cannot find the *.ast.json file for {self.entry} in {traced_repo_path}." + ) + + self._modify_file(traced_file) + + # Run the modified file in a container. + self.container = get_container() + logger.debug(f"Launching the proof using {type(self.container)}") + mts = [Mount(Path.cwd(), Path(f"/workspace/{self.repo.name}"))] + self.container.run( + "lake build Lean4Repl", + mts, + as_current_user=True, + capture_output=True, + work_dir=f"/workspace/{self.repo.name}", + cpu_limit=None, + memory_limit=None, + envs={}, + ) + assert re.fullmatch(r"\d+g", TACTIC_MEMORY_LIMIT) + memory_limit = 1024 * int(TACTIC_MEMORY_LIMIT[:-1]) + cmd = f"lake env lean --threads={TACTIC_CPU_LIMIT} --memory={memory_limit} {self.file_path}" + + self.proc = self.container.run_interactive( + cmd, + mts, + cpu_limit=None, + memory_limit=None, + work_dir=f"/workspace/{self.repo.name}", + as_current_user=True, + envs={}, + ) + + # Get the initial tactic state. + try: + res = json.loads(self._read_next_line()[0]) + except Exception as ex: + if traced_file.has_prelude: + raise DojoInitError( + "Currently LeanDojo does not support interacting with proofs in prelude files." 
+ ) + elif isinstance(ex, EOFError): + raise DojoInitError("EOF") + else: + raise ex + + assert res["error"] is None + + # logger.debug(f"Response: {res}") + if self.uses_tactics: + assert res["tacticState"] != "no goals" + init_state: State = TacticState( + self._post_process(res["tacticState"]), + res["sid"], + ) + else: + assert self.uses_commands + init_state = CommandState(int(res["sid"])) + + self.start_time = time.monotonic() + self._set_timer() + + return self, init_state + + except Exception as ex: + os.chdir(self.origin_dir) + shutil.rmtree(self.tmp_dir) + raise ex + + def _locate_traced_file(self, traced_repo_path: Path) -> TracedFile: + json_path = to_json_path(traced_repo_path, self.file_path, self.repo) + return TracedFile.from_traced_file(traced_repo_path, json_path, self.repo) + + def _set_timer(self) -> None: + if self.hard_timeout is not None: + signal.signal(signal.SIGALRM, self._handle_hard_timeout) + signal.alarm(int(self.hard_timeout)) + + def _cancel_timer(self) -> None: + if self.hard_timeout is not None: + signal.alarm(0) + signal.signal(signal.SIGALRM, signal.SIG_DFL) + + def _handle_hard_timeout(self, signum: Any, frame: Any) -> None: + logger.debug(f"Hard timeout in {self}") + self.has_timedout = True + raise DojoHardTimeoutError() + + def _install_handlers(self) -> None: + self.old_sigint = signal.signal(signal.SIGINT, self._exit_gracefully) + self.old_sigterm = signal.signal(signal.SIGTERM, self._exit_gracefully) + + def _uninstall_handlers(self) -> None: + signal.signal(signal.SIGINT, self.old_sigint) + signal.signal(signal.SIGTERM, self.old_sigterm) + + def _exit_gracefully(self, signum: Any, frame: Any) -> None: + logger.debug("Exiting gracefully.") + sys.exit(-1) + + def _cleanup(self) -> None: + logger.debug("Cleaning up.") + try: + self._cleanup_container() + self._cleanup_proc() + finally: + self._cleanup_tmp_dir() + self._uninstall_handlers() + + def _cleanup_container(self) -> None: + """Clean up the container.""" + logger.debug("Cleaning up the container.") + assert isinstance(self.container, DockerContainer) or isinstance( + self.container, NativeContainer + ) + self.container.cleanup() + + def _cleanup_proc(self) -> None: + """Clean up the subprocess.""" + logger.debug(f"Cleaning up the subprocess {self.proc.pid}.") + _kill_descendants(psutil.Process(self.proc.pid)) + """ + self.proc.terminate() + try: + self.proc.wait(timeout=0.5) + except TimeoutExpired: + self.proc.kill() + """ + + def _cleanup_tmp_dir(self) -> None: + """Clean up the temporary directory.""" + logger.debug("Cleaning up the temporary directory.") + os.chdir(self.origin_dir) + if self.tmp_dir is not None and os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + + def __exit__(self, exc_type: None, exc_val: None, exc_tb: None) -> None: + """Exit Dojo. + + Args: + exc_type (None): _description_ + exc_val (None): _description_ + exc_tb (None): _description_ + """ + # Cancel the hard timeout. + self._cancel_timer() + self._cleanup() + + def _post_process(self, tactic_state: str) -> str: + """Post-process the pretty-printed tactic state. 
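+
+        For example (illustrative input), a state printed as ``"2 goals\ncase inl\n..."``
+        has the leading ``"2 goals\n"`` header removed, leaving ``"case inl\n..."``.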
+ + Args: + tactic_state (str): _description_ + + Returns: + str: _description_ + """ + m = re.match(r"\d+ goals\n", tactic_state) + if m is not None: + return tactic_state[m.end() :] + else: + return tactic_state + + def _get_imports(self) -> str: + imports = ["Lean4Repl"] + self.additional_imports + return "\n".join(f"import {_}" for _ in imports) + "\n\n" + + def _modify_file(self, traced_file: TracedFile) -> None: + logger.debug(f"Modifying {traced_file.lean_file.path}") + + if self.uses_tactics: + # Interaction through tactics. + modified_code = self._modify_proof(traced_file) + else: + # Interaction through commands (supported only in Lean 4 via CommandElabM). + lean_file = traced_file.lean_file + pos = Pos(line_nb=self.entry[2], column_nb=1) + code_before = get_code_without_comments( + lean_file, lean_file.start_pos, pos, traced_file.comments + ) + modified_code = ( + self._get_imports() + + code_before + + "set_option maxHeartbeats 0 in\n#lean_dojo_repl\n\n" + + lean_file[pos:] + ) + + repl_file = "Lean4Repl.lean" + repl_dst = Path(repl_file) + + if os.path.exists("lakefile.lean"): + with open("lakefile.lean", "a") as oup: + oup.write("\nlean_lib Lean4Repl {\n\n}\n") + else: + assert os.path.exists("lakefile.toml") + with open("lakefile.toml", "a") as oup: + oup.write('\n[[lean_lib]]\nname = "Lean4Repl"\n') + + if os.path.exists("lakefile.olean"): + os.remove("lakefile.olean") + if os.path.exists(".lake/lakefile.olean"): + os.remove(".lake/lakefile.olean") + + # Copy the REPL code to the right directory. + repl_src = Path(__file__).with_name(repl_file) + repl_code = repl_src.open().read() + if repl_dst.exists(): + raise DojoInitError(f"{repl_dst} exists") + with repl_dst.open("wt") as oup: + oup.write(repl_code) + + # Write the modified code to the file. + with self.file_path.open("wt") as oup: + oup.write(modified_code) + + def _modify_proof(self, traced_file: TracedFile) -> str: + # Modify the proof and set up the `repl` tactic. + assert isinstance(self.entry, Theorem) + traced_theorem = traced_file.get_traced_theorem(self.entry) + if traced_theorem is None: + raise DojoInitError( + f"Failed to locate the theorem with `{self.entry.full_name}` as its fully qualified name" + ) + proof_start, proof_end = traced_theorem.locate_proof() + lean_file = traced_file.lean_file + + code_import = self._get_imports() + code_proof = "\nby\n lean_dojo_repl\n sorry\n" + code_before_theorem = get_code_without_comments( + lean_file, lean_file.start_pos, traced_theorem.start, traced_file.comments + ) + code_thereom = lean_file[traced_theorem.start : proof_start] + modified_code = ( + code_import + + code_before_theorem + + "\nset_option maxHeartbeats 0 in\n" + + code_thereom + + code_proof + + lean_file[proof_end:] + ) + + return str(modified_code) + + def run_tac(self, state: TacticState, tactic: str) -> TacticResult: + if not isinstance(state, TacticState): + raise RuntimeError( + f"Attempting to run a tactic on an invalid state {state}." 
+ ) + assert isinstance(tactic, str), f"Invalid tactic {tactic}" + + tsid = state.id + req = json.dumps({"sid": tsid, "cmd": tactic}, ensure_ascii=False) + res = self._submit_request(req) + + if res["error"] is not None: + if "proof contains `sorry`" in res["error"]: + return ProofGivenUp() + elif "try_for_time tactic failed, timeout" in res["error"]: + return TimeoutError(res["error"].strip()) + else: + return LeanError(res["error"].strip()) + elif res["tacticState"] == "no goals": + self.is_successful = True + return ProofFinished(res["sid"], res["message"]) + else: + tactic_state = self._post_process(res["tacticState"]) + return TacticState( + tactic_state, + res["sid"], + res["message"], + ) + + def run_cmd(self, state: CommandState, command: str) -> CommandResult: + if not isinstance(state, CommandState): + raise RuntimeError( + f"Attempting to run a command on an invalid state {state}." + ) + assert isinstance(command, str), f"Invalid command {command}" + + csid = state.id + req = json.dumps({"sid": csid, "cmd": command}, ensure_ascii=False) + res = self._submit_request(req) + + if res["error"] is not None: + return LeanError(res["error"].strip()) + else: + return CommandState(res["sid"], res["message"]) + + def _submit_request(self, req: str) -> Dict[str, Any]: + """Submit a request to Lean and get the response. + + Args: + req (str): _description_ + + Raises: + DojoCrashError: _description_ + + Returns: + Dict[str, Any]: _description_ + """ + if self.proc.stdin is None: + raise RuntimeError("self.proc.stdin is not initialized") + self._check_alive() + logger.debug(req) + self.proc.stdin.write(req + "\n") + try: + res, msg = self._read_next_line() + except EOFError: + raise DojoCrashError("Unexpected EOF") + try: + result: Dict[str, Any] = json.loads(res) + except json.decoder.JSONDecodeError: + raise DojoCrashError(f"Invalid JSON: {res}") + + result["message"] = msg + return result + + def _check_alive(self) -> None: + exit_code = self.proc.poll() + if exit_code is None: + return + elif exit_code == 137: + raise DojoCrashError("OOM") + else: + raise DojoCrashError(f"Unknown exit code: {exit_code}") + + def _read_next_line(self) -> Tuple[str, str]: + """Read the next line from `self.proc`. + + Raises: + EOFError: _description_ + DojoCrashError: _description_ + DojoInitError: _description_ + + Returns: + str: _description_ + """ + if self.proc.stdout is None: + raise RuntimeError("self.proc.stout is not initialized") + msg: List[str] = [] + while True: + line = self.proc.stdout.readline().strip() + logger.debug(line) + if line == "": + raise EOFError + if line.startswith(_REPL_PROMPT): + self._check_alive() + return line[len(_REPL_PROMPT) :].strip(), "\n".join(msg) + elif "error: " in line: + if ( + "error: deep recursion was detected" in line + or "error: [fatal] not_a_theorem" in line + ): + self.is_crashed = True + raise DojoCrashError(line) + elif "error: unknown package" in line: + self.is_crashed = True + raise DojoInitError(line) + else: + pass + else: + msg.append(line) diff --git a/interaction/parse_goals.py b/interaction/parse_goals.py new file mode 100644 index 0000000..6472731 --- /dev/null +++ b/interaction/parse_goals.py @@ -0,0 +1,69 @@ +"""Utilities for parsing Lean's pretty-printed proof goals. +""" + +import re +from typing import List +from dataclasses import dataclass + + +_DECL_REGEX = re.compile( + r"(?<=\n)(?P.+?)\s+\:(?P.+?)\n(?=\S)", re.DOTALL +) +"""Regex for a line of declarations in the local context. 
+ +It can be a single declaration such as ``x : Nat`` or multiple declarations such as ``x y : Nat``. +""" + + +_CASE_REGEX = re.compile(r"case\s\S+\n") + + +_SPACE_REGEX = re.compile(r"\s+") + + +@dataclass(frozen=True) +class Declaration: + """A declaration in the local context.""" + + ident: str + lean_type: str + + def __post_init__(self) -> None: + assert _SPACE_REGEX.search(self.ident) is None + + +def _parse_local_context(ctx_pp: str) -> List[Declaration]: + """Parse the local context of a goal.""" + m = _CASE_REGEX.match(ctx_pp) + if m is not None: + ctx_pp = ctx_pp[m.end() :] + + decls = [] + for m in _DECL_REGEX.finditer("\n" + ctx_pp + "⊢"): + lean_type = m["lean_type"].strip() + if lean_type.endswith(","): + lean_type = lean_type[:-1].strip() + for ident in m["idents"].strip().split(): + decls.append(Declaration(ident.strip(), lean_type)) + return decls + + +@dataclass(frozen=True) +class Goal: + """A goal in Lean.""" + + assumptions: List[Declaration] + conclusion: str + + @classmethod + def from_pp(cls, pp: str) -> "Goal": + """Parse a pretty-printed goal.""" + assert pp.count("⊢") == 1 + ctx, concl = pp.split("⊢") + assumptions = _parse_local_context(ctx) + return cls(assumptions, concl.strip()) + + +def parse_goals(pp: str) -> List[Goal]: + """Parse a list of pretty-printed goals.""" + return [Goal.from_pp(g) for g in pp.split("\n\n") if "⊢" in g] diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..9891276 --- /dev/null +++ b/utils.py @@ -0,0 +1,314 @@ +"""Utility functions used internally by LeanDojo. +""" + +import re +import os +import ray +import time +import urllib +import typing +import hashlib +import tempfile +import subprocess +from pathlib import Path +from loguru import logger +from functools import cache +from contextlib import contextmanager +from ray.util.actor_pool import ActorPool +from typing import Tuple, Union, List, Generator, Optional + +from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR + + +@contextmanager +def working_directory( + path: Optional[Union[str, Path]] = None +) -> Generator[Path, None, None]: + """Context manager setting the current working directory (CWD) to ``path`` (or a temporary directory if ``path`` is None). + + The original CWD is restored after the context manager exits. + + Args: + path (Optional[Union[str, Path]], optional): The desired CWD. Defaults to None. + + Yields: + Generator[Path, None, None]: A ``Path`` object representing the CWD. + """ + origin = Path.cwd() + if path is None: + tmp_dir = tempfile.TemporaryDirectory(dir=TMP_DIR) + path = tmp_dir.__enter__() + is_temporary = True + else: + is_temporary = False + + path = Path(path) + if not path.exists(): + path.mkdir(parents=True) + os.chdir(path) + + try: + yield path + finally: + os.chdir(origin) + if is_temporary: + tmp_dir.__exit__(None, None, None) + + +@contextmanager +def ray_actor_pool( + actor_cls: type, *args, **kwargs +) -> Generator[ActorPool, None, None]: + """Create a pool of Ray Actors of class ``actor_cls``. + + Args: + actor_cls (type): A Ray Actor class (annotated by ``@ray.remote``). + *args: Position arguments passed to ``actor_cls``. + **kwargs: Keyword arguments passed to ``actor_cls``. + + Yields: + Generator[ActorPool, None, None]: A :class:`ray.util.actor_pool.ActorPool` object. 
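+
+    A minimal usage sketch (``MyActor`` is a hypothetical ``@ray.remote`` actor class)::
+
+        with ray_actor_pool(MyActor, some_arg) as pool:
+            results = list(pool.map_unordered(lambda a, x: a.process.remote(x), items))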
+ """ + assert not ray.is_initialized() + ray.init() + pool = ActorPool([actor_cls.remote(*args, **kwargs) for _ in range(NUM_WORKERS)]) + try: + yield pool + finally: + ray.shutdown() + + +@contextmanager +def report_critical_failure(msg: str) -> Generator[None, None, None]: + """Context manager logging ``msg`` in case of any exception. + + Args: + msg (str): The message to log in case of exceptions. + + Raises: + ex: Any exception that may be raised within the context manager. + """ + try: + yield + except Exception as ex: + logger.error(msg) + raise ex + + +def execute( + cmd: Union[str, List[str]], capture_output: bool = False +) -> Optional[Tuple[str, str]]: + """Execute the shell command ``cmd`` and optionally return its output. + + Args: + cmd (Union[str, List[str]]): The shell command to execute. + capture_output (bool, optional): Whether to capture and return the output. Defaults to False. + + Returns: + Optional[Tuple[str, str]]: The command's output, including stdout and stderr (None if ``capture_output == False``). + """ + try: + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + except subprocess.CalledProcessError as ex: + if capture_output: + logger.info(ex.stdout.decode()) + logger.error(ex.stderr.decode()) + raise ex + if not capture_output: + return None + output = res.stdout.decode() + error = res.stderr.decode() + return output, error + + +def compute_md5(path: Path) -> str: + """Return the MD5 hash of the file ``path``.""" + # The file could be large + # See: https://stackoverflow.com/questions/48122798/oserror-errno-22-invalid-argument-when-reading-a-huge-file + hasher = hashlib.md5() + with path.open("rb") as inp: + while True: + block = inp.read(64 * (1 << 20)) + if not block: + break + hasher.update(block) + return hasher.hexdigest() + + +_CAMEL_CASE_REGEX = re.compile(r"(_|-)+") + + +def camel_case(s: str) -> str: + """Convert the string ``s`` to camel case.""" + return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") + + +@cache +def get_repo_info(path: Path) -> Tuple[str, str]: + """Get the URL and commit hash of the Git repo at ``path``. + + Args: + path (Path): Path to the Git repo. + + Returns: + Tuple[str, str]: URL and (most recent) hash commit + """ + with working_directory(path): + # Get the URL. + url_msg, _ = execute(f"git remote get-url origin", capture_output=True) + url = url_msg.strip() + # Get the commit. + commit_msg, _ = execute(f"git log -n 1", capture_output=True) + m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) + assert m is not None + commit = m.group() + + if url.startswith("git@"): + assert url.endswith(".git") + url = url[: -len(".git")].replace(":", "/").replace("git@", "https://") + + return url, commit + + +def is_optional_type(tp: type) -> bool: + """Test if ``tp`` is Optional[X].""" + if typing.get_origin(tp) != Union: + return False + args = typing.get_args(tp) + return len(args) == 2 and args[1] == type(None) + + +def remove_optional_type(tp: type) -> type: + """Given Optional[X], return X.""" + if typing.get_origin(tp) != Union: + return False + args = typing.get_args(tp) + if len(args) == 2 and args[1] == type(None): + return args[0] + else: + raise ValueError(f"{tp} is not Optional") + + +@cache +def read_url(url: str, num_retries: int = 2) -> str: + """Read the contents of the URL ``url``. 
Retry if failed""" + backoff = 1 + while True: + try: + with urllib.request.urlopen(url) as f: + return f.read().decode() + except Exception as ex: + if num_retries <= 0: + raise ex + num_retries -= 1 + logger.debug(f"Request to {url} failed. Retrying...") + time.sleep(backoff) + backoff *= 2 + + +@cache +def url_exists(url: str) -> bool: + """Return True if the URL ``url`` exists.""" + try: + with urllib.request.urlopen(url) as _: + return True + except urllib.error.HTTPError: + return False + + +def parse_int_list(s: str) -> List[int]: + assert s.startswith("[") and s.endswith("]") + return [int(_) for _ in s[1:-1].split(",") if _ != ""] + + +def parse_str_list(s: str) -> List[str]: + assert s.startswith("[") and s.endswith("]") + return [_.strip()[1:-1] for _ in s[1:-1].split(",") if _ != ""] + + +@cache +def is_git_repo(path: Path) -> bool: + """Check if ``path`` is a Git repo.""" + with working_directory(path): + return ( + os.system("git rev-parse --is-inside-work-tree 1>/dev/null 2>/dev/null") + == 0 + ) + + +def _from_lean_path(root_dir: Path, path: Path, repo, ext: str) -> Path: + assert path.suffix == ".lean" + if path.is_absolute(): + path = path.relative_to(root_dir) + + assert root_dir.name != "lean4" + if path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/src/lean/lake"): + # E.g., "lake-packages/lean4/src/lean/lake/Lake/CLI/Error.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/src/lean/lake") + return LEAN4_PACKAGES_DIR / "lean4/lib/lean" / p.with_suffix(ext) + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/src"): + # E.g., "lake-packages/lean4/src/lean/Init.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/src").with_suffix(ext) + return LEAN4_PACKAGES_DIR / "lean4/lib" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR): + # E.g., "lake-packages/std/Std.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR).with_suffix(ext) + repo_name = p.parts[0] + return ( + LEAN4_PACKAGES_DIR + / repo_name + / LEAN4_BUILD_DIR + / "ir" + / p.relative_to(repo_name) + ) + else: + # E.g., "Mathlib/LinearAlgebra/Basics.lean" + return LEAN4_BUILD_DIR / "ir" / path.with_suffix(ext) + + +def to_xml_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".trace.xml") + + +def to_dep_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".dep_paths") + + +def to_json_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".ast.json") + + +def to_lean_path(root_dir: Path, path: Path, repo) -> bool: + if path.is_absolute(): + path = path.relative_to(root_dir) + + if path.suffix in (".xml", ".json"): + path = path.with_suffix("").with_suffix(".lean") + else: + assert path.suffix == ".dep_paths" + path = path.with_suffix(".lean") + + assert root_dir.name != "lean4" + if path == LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake.lean": + return LEAN4_PACKAGES_DIR / "lean4/src/lean/lake/Lake.lean" + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake"): + # E.g., "lake-packages/lean4/lib/lean/Lake/Util/List.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake") + return LEAN4_PACKAGES_DIR / "lean4/src/lean/lake/Lake" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/lib"): + # E.g., "lake-packages/lean4/lib/lean/Init.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/lib") + return LEAN4_PACKAGES_DIR / "lean4/src" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR): + # E.g., "lake-packages/std/build/ir/Std.lean" + p = 
path.relative_to(LEAN4_PACKAGES_DIR) + repo_name = p.parts[0] + return ( + LEAN4_PACKAGES_DIR + / repo_name + / p.relative_to(Path(repo_name) / LEAN4_BUILD_DIR / "ir") + ) + else: + # E.g., ".lake/build/ir/Mathlib/LinearAlgebra/Basics.lean" or "build/ir/Mathlib/LinearAlgebra/Basics.lean" + assert path.is_relative_to(LEAN4_BUILD_DIR / "ir"), path + return path.relative_to(LEAN4_BUILD_DIR / "ir") From c20d300c43fc765073e43e53de7877a085f7b79f Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 23 Oct 2025 23:21:05 -0400 Subject: [PATCH 21/29] Fix _to_commit_hash compatibility fallback --- data_extraction/lean.py | 106 +++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 187288c..6238f93 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -88,35 +88,69 @@ def cleanse_string(s: Union[str, Path]) -> str: @cache def _to_commit_hash(repo: Repository, label: str) -> str: - """Convert a tag or branch to a commit hash.""" + """Convert a tag or branch to a commit hash with fallback strategies.""" logger.debug(f"Querying the commit hash for {repo.name} {label}") - - # Poor man's cache - if repo.name == "lean4": - if label == "v4.23.0-rc2": - return "ad1a017949674a947f0d6794cbf7130d642c6530" - elif label == "v4.17.0": - return "306f36116535cd226329f562b4675b8b6dbf948c" - elif label == "v4.8.0-rc2": - return "873ef2d894af80d8fc672e35f7e28bae314a1f6f" - - # if the label is a commit hash, return it directly - if len(label) == 40 and _COMMIT_REGEX.fullmatch(label.strip()): - return label - for branch in repo.get_branches(): - if branch.name == label: - print(f"Found branch {branch.name} with commit {branch.commit.sha}") - return branch.commit.sha + label_stripped = (label or "").strip() + if len(label_stripped) == 40 and _COMMIT_REGEX.fullmatch(label_stripped): + return label_stripped + + candidates = [] + if label_stripped.startswith("v"): + candidates += [label_stripped, label_stripped.lstrip("v")] + else: + candidates += [label_stripped, f"v{label_stripped}"] + + base = label_stripped[1:] if label_stripped.startswith("v") else label_stripped + if "-rc" in base: + base_no_rc = base.split("-rc", 1)[0] + candidates += [base_no_rc, f"v{base_no_rc}"] + + seen = set() + uniq = [] + for cand in candidates: + if cand and cand not in seen: + uniq.append(cand) + seen.add(cand) + candidates = uniq + + for cand in candidates: + try: + ref = repo.get_git_ref(f"tags/{cand}") + obj = ref.object + if obj.type == "tag": + tag_obj = repo.get_git_tag(obj.sha) + if tag_obj.object.type == "commit": + return tag_obj.object.sha + elif obj.type == "commit": + return obj.sha + except Exception: + pass + + try: + return repo.get_commit(cand).sha + except Exception: + pass - for tag in repo.get_tags(): - if tag.name == label: - print(f"Found tag {tag.name} with commit {tag.commit.sha}") - return tag.commit.sha + try: + data = read_url(f"https://api.github.com/repos/{repo.full_name}/commits/{cand}") + sha = json.loads(data).get("sha") + if sha: + return sha + except Exception: + pass raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") +def _to_commit_hash_compat(repo: Repository, label: str) -> str: + """Compatibility wrapper: supports both (repo, label) and (label) call signatures.""" + try: + return _to_commit_hash(repo, label) + except TypeError: + return _to_commit_hash(label) + + @dataclass(eq=True, unsafe_hash=True) class Pos: """Position in source files. 
@@ -357,9 +391,9 @@ def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: version = config[len(prefix) :] if version.startswith("nightly"): - return _to_commit_hash(LEAN4_NIGHTLY_REPO, version) + return _to_commit_hash_compat(LEAN4_NIGHTLY_REPO, version) else: - return _to_commit_hash(LEAN4_REPO, version) + return _to_commit_hash_compat(LEAN4_REPO, version) URL = TAG = COMMIT = str @@ -445,7 +479,7 @@ def __post_init__(self) -> None: if (self.url, self.commit) in info_cache.tag2commit: commit = info_cache.tag2commit[(self.url, self.commit)] else: - commit = _to_commit_hash(self.repo, self.commit) + commit = _to_commit_hash_compat(self.repo, self.commit) assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" info_cache.tag2commit[(self.url, self.commit)] = commit object.__setattr__(self, "commit", commit) @@ -498,18 +532,22 @@ def clone_and_checkout(self) -> None: user_name, repo_name = _split_git_url(self.url) local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + local_repo_path.parent.mkdir(parents=True, exist_ok=True) + if os.path.exists(local_repo_path): logger.info(f"{self} already exists locally.") else: logger.debug(f"Cloning {self}") - execute(f"git clone -n --recursive {self.url}", capture_output=True) - - - with working_directory(local_repo_path): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + execute( + f"git clone -n --recursive {self.url} {local_repo_path}", + capture_output=True, + ) + + with working_directory(local_repo_path): + execute( + f"git checkout {self.commit} && git submodule update --recursive", + capture_output=True, + ) def get_dependencies( self, path: Union[str, Path, None] = None @@ -591,7 +629,7 @@ def _parse_deps( commit = rev else: try: - commit = _to_commit_hash(url_to_repo(url), rev) + commit = _to_commit_hash_compat(url_to_repo(url), rev) except ValueError: commit = get_latest_commit(url) assert _COMMIT_REGEX.fullmatch(commit) From eb8eac6ca1f8039a462b47cbfcef57dcf2a18c0c Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 19:14:51 -0400 Subject: [PATCH 22/29] Patch LeanDojo tracing and repo handling for LeanAgent pipeline --- container.py | 3 -- data_extraction/build_lean4_repo.py | 1 - data_extraction/lean.py | 69 +++++++++++++++++++---------- data_extraction/trace.py | 6 ++- 4 files changed, 51 insertions(+), 28 deletions(-) diff --git a/container.py b/container.py index af9f25a..2022eb2 100644 --- a/container.py +++ b/container.py @@ -118,7 +118,6 @@ class NativeContainer(Container): def _mount_files(self, mounts: List[Mount]) -> None: cwd = Path.cwd() - import ipdb; ipdb.set_trace() for src, dst in mounts: if dst.is_absolute(): dst = cwd / dst.relative_to(dst.root) @@ -134,7 +133,6 @@ def _mount_files(self, mounts: List[Mount]) -> None: def _unmount_files(self, mounts: List[Mount]) -> None: cwd = Path.cwd() - for src, dst in mounts: if dst.is_absolute(): dst = cwd / dst.relative_to(dst.root) @@ -184,7 +182,6 @@ def run( assert memory_limit is None, "NativeContainer does not support memory limit." assert cpu_limit is None, "NativeContainer does not support CPU limit." 
- import ipdb; ipdb.set_trace() self._mount_files(mounts) cmd = self._build_native_command(command, envs) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index a15dd61..dc82257 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -154,7 +154,6 @@ def is_new_version(v: str) -> bool: def main() -> None: - import ipdb; ipdb.set_trace() parser = argparse.ArgumentParser() parser.add_argument("repo_name") parser.add_argument("--no-deps", action="store_true") diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 6238f93..9a63618 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -50,11 +50,18 @@ LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") """The GitHub Repo for Lean 4 nightly releases.""" +TOOLCHAIN_OVERRIDES = { + "ImperialCollegeLondon/FLT": "leanprover/lean4:v4.25.0-rc1", +} + _URL_REGEX = re.compile(r"(?P.*?)/*") def normalize_url(url: str) -> str: - return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + cleaned = _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + if cleaned.endswith(".git"): + cleaned = cleaned[:-4] + return cleaned @cache @@ -143,12 +150,17 @@ def _to_commit_hash(repo: Repository, label: str) -> str: raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") -def _to_commit_hash_compat(repo: Repository, label: str) -> str: - """Compatibility wrapper: supports both (repo, label) and (label) call signatures.""" +def _to_commit_hash_compat(*args, **kwargs): + """ + Compatibility wrapper for LeanDojo versions that define _to_commit_hash with either: + (repo: Repository, label: str) OR (label: str) + """ try: - return _to_commit_hash(repo, label) + return _to_commit_hash(*args, **kwargs) except TypeError: - return _to_commit_hash(label) + if len(args) == 2 and not kwargs: + return _to_commit_hash(args[1]) + raise @dataclass(eq=True, unsafe_hash=True) @@ -525,29 +537,40 @@ def show(self) -> None: def exists(self) -> bool: return url_exists(self.commit_url) + def toolchain_spec(self) -> Optional[str]: + owner_repo = "/".join(self.url.split("/")[-2:]) + if owner_repo in TOOLCHAIN_OVERRIDES: + return TOOLCHAIN_OVERRIDES[owner_repo] + try: + config = self.get_config("lean-toolchain") + except Exception: + return None + content = (config.get("content") or "").strip() + return content or None + def clone_and_checkout(self) -> None: - """Clone the repo to the current working directory and checkout a specific commit.""" - # Check if the repo already exists. - # If it exists, we assume it has been checked out to the correct commit. - - user_name, repo_name = _split_git_url(self.url) - local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + """ + Clone the repo into $REPO_DIR// (creating parents), then checkout the pinned commit + and update submodules. If it already exists, assume it's correct and skip. 
+ """ + owner, name = _split_git_url(self.url) + base = Path(os.environ.get("REPO_DIR", ".")) + local_repo_path = (base / owner / name).resolve() local_repo_path.parent.mkdir(parents=True, exist_ok=True) - if os.path.exists(local_repo_path): - logger.info(f"{self} already exists locally.") - else: - logger.debug(f"Cloning {self}") - execute( - f"git clone -n --recursive {self.url} {local_repo_path}", - capture_output=True, - ) + if local_repo_path.exists(): + logger.info(f"{self} already exists locally at {local_repo_path}.") + return + + logger.debug(f"Cloning {self} into {local_repo_path}") + execute( + f'git clone -n --recursive "{self.url}" "{local_repo_path}"', + capture_output=True, + ) with working_directory(local_repo_path): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + execute(f'git checkout "{self.commit}"', capture_output=True) + execute("git submodule update --init --recursive", capture_output=True) def get_dependencies( self, path: Union[str, Path, None] = None diff --git a/data_extraction/trace.py b/data_extraction/trace.py index cd1ddc2..560cd31 100644 --- a/data_extraction/trace.py +++ b/data_extraction/trace.py @@ -35,6 +35,11 @@ def _trace(repo: LeanGitRepo, build_deps: bool) -> None: repo.clone_and_checkout() logger.debug(f"Tracing {repo}") + toolchain = repo.toolchain_spec() + if toolchain: + logger.info(f"{repo} declares toolchain {toolchain}") + else: + logger.warning(f"No lean-toolchain found for {repo}; proceeding without explicit toolchain.") container = get_container() mts = { Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name: f"/workspace/{user_name}/{repo_name}", @@ -48,7 +53,6 @@ def _trace(repo: LeanGitRepo, build_deps: bool) -> None: cmd += " --no-deps" try: - import ipdb; ipdb.set_trace() container.run( cmd, create_mounts(mts), From a3347e95bfd4de47486e66bf4b70b5ca6451cfe7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:21:38 -0400 Subject: [PATCH 23/29] Skip tracing when artifacts already present --- data_extraction/build_lean4_repo.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index dc82257..9947cfc 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -162,20 +162,22 @@ def main() -> None: num_procs = int(os.environ["NUM_PROCS"]) repo_name = args.repo_name os.chdir(repo_name) - - if is_new_version(get_lean_version()): - packages_path = ".lake/packages" - build_path = ".lake/build" + + lean_version = get_lean_version() + use_new_layout = is_new_version(lean_version) + if use_new_layout: + packages_path = ".lake/packages" + build_path = ".lake/build" else: packages_path = "lake-packages" build_path = "build" - - # if check_files(packages_path, args.no_deps): - # logger.info(f"The repo {repo_name} has already been traced.") - # return + + if check_files(packages_path, args.no_deps): + logger.info(f"The repo {repo_name} has already been traced.") + return # If the lean4 package exists, we assume the build has completed and we just need to trace - if (Path(".lake/packages/lean4") if is_new_version(get_lean_version()) else Path("lake-packages/lean4")).exists(): + if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): logger.info(f"The repo {repo_name} has already been built, but has not been traced.") else: # Build the repo using lake. 
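The early return above depends on is_new_version to choose between the old "lake-packages"/"build" layout and the newer ".lake/packages"/".lake/build" layout, which hinges on whether the toolchain is at least 4.3.0-rc2. An equivalent check can be written as a single tuple comparison; this is an illustrative sketch under that assumption, not the implementation the patch keeps:

import re

def at_least_4_3_0_rc2(version: str) -> bool:
    """True iff `version` (e.g. '4.8.0' or '4.3.0-rc1') is at least 4.3.0-rc2."""
    m = re.fullmatch(r"(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?", version)
    if m is None:
        raise ValueError(f"Unrecognized Lean version: {version}")
    major, minor, patch, rc = m.groups()
    # A missing rc suffix means a final release, which sorts after every release candidate.
    key = (int(major), int(minor), int(patch), int(rc) if rc else float("inf"))
    return key >= (4, 3, 0, 2)

assert at_least_4_3_0_rc2("4.3.0-rc2") and at_least_4_3_0_rc2("4.8.0")
assert not at_least_4_3_0_rc2("4.3.0-rc1") and not at_least_4_3_0_rc2("4.2.0")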
From 9277838324064bac52f748a3c63d622628709049 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 4 Nov 2025 08:24:52 -0500 Subject: [PATCH 24/29] Ensure ExtractData available per trace run --- data_extraction/build_lean4_repo.py | 32 ++++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index 9947cfc..b698752 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -94,8 +94,12 @@ def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, N num_total = len(olean_files) p = Process(target=_monitor, args=(paths, num_total), daemon=True) p.start() - yield - p.kill() + try: + yield + finally: + p.join(timeout=1) + if p.is_alive(): + p.terminate() def get_lean_version() -> str: @@ -163,6 +167,10 @@ def main() -> None: repo_name = args.repo_name os.chdir(repo_name) + extractor_src = Path(__file__).with_name("ExtractData.lean").resolve() + extractor_dst = Path("ExtractData.lean") + shutil.copy2(extractor_src, extractor_dst) + lean_version = get_lean_version() use_new_layout = is_new_version(lean_version) if use_new_layout: @@ -172,10 +180,6 @@ def main() -> None: packages_path = "lake-packages" build_path = "build" - if check_files(packages_path, args.no_deps): - logger.info(f"The repo {repo_name} has already been traced.") - return - # If the lean4 package exists, we assume the build has completed and we just need to trace if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): logger.info(f"The repo {repo_name} has already been built, but has not been traced.") @@ -201,12 +205,16 @@ def main() -> None: dirs_to_monitor.append(packages_path) logger.info(f"Tracing {repo_name}") - with launch_progressbar(dirs_to_monitor): - cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" - if args.no_deps: - cmd += " noDeps" - logger.debug(cmd) - run_cmd(cmd, capture_output=True) + try: + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + finally: + if extractor_dst.exists(): + extractor_dst.unlink() assert check_files(packages_path, args.no_deps), "Some files failed to be processed." 
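Patch 24 stages ExtractData.lean next to the repo before tracing and unlinks it in a finally block so that repeated runs start from a clean checkout. The same stage-run-cleanup pattern can also be expressed as a context manager; a minimal sketch assuming only shutil and pathlib, with a hypothetical helper name and illustrative paths:

import shutil
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

@contextmanager
def staged_file(src: Path, dst: Path) -> Iterator[Path]:
    """Copy `src` to `dst` for the duration of the block, then remove it."""
    shutil.copy2(src, dst)
    try:
        yield dst
    finally:
        if dst.exists():
            dst.unlink()

# Usage (paths are illustrative):
# with staged_file(Path("ExtractData.lean"), Path("repo/ExtractData.lean")):
#     run_cmd("lake env lean --run ExtractData.lean")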
From db18ef8213d096955c01acc03fee125190479847 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 6 Nov 2025 10:20:59 -0500 Subject: [PATCH 25/29] Patch macOS dylib handling and lean-toolchain parser --- data_extraction/build_lean4_repo.py | 72 ++++++++++++++++++++++++++++- data_extraction/lean.py | 52 +++++++++++++++++++-- 2 files changed, 118 insertions(+), 6 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index b698752..5655a2d 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -4,6 +4,7 @@ """ import os +import sys import re import shutil import argparse @@ -33,6 +34,60 @@ def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optiona return None +def is_macos() -> bool: + return sys.platform == "darwin" + + +def _patch_dylib(path: Path) -> None: + """Adjust __DATA_CONST flags so macOS 15 accepts the library.""" + try: + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segprot", + "__DATA_CONST", + "r--", + "rw-", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segflags", + "__DATA_CONST", + "0x4", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + ["codesign", "--force", "--sign", "-", str(path)], + check=True, + capture_output=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as ex: + logger.warning(f"Failed to patch {path}: {ex}") + + +def patch_dylibs(root: Path) -> None: + if not is_macos(): + return + dylibs = list(root.rglob("*.dylib")) + if not dylibs: + return + logger.debug(f"Patching {len(dylibs)} dylibs under {root}") + for dylib in dylibs: + _patch_dylib(dylib) + + def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. @@ -192,7 +247,22 @@ def main() -> None: run_cmd("lake exe cache get") except subprocess.CalledProcessError: pass - run_cmd("lake build") + + # Try building; on macOS, if the build fails due to SG_READ_ONLY, patch dylibs and retry once. + try: + run_cmd("lake build") + except subprocess.CalledProcessError as e: + if is_macos(): + logger.warning("lake build failed; patching dylibs for macOS and retrying once") + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + run_cmd("lake build") + else: + raise + + # Ensure final artifacts are patched as well. + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) # Copy the Lean 4 stdlib into the path of packages. 
lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 9a63618..d1705f7 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -382,6 +382,18 @@ def __getitem__(self, key) -> str: _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +def _read_toolchain_content(config_dict: Dict[str, Any]) -> str: + """Extract the textual content of a lean-toolchain definition.""" + content = config_dict.get("content") + if content is None: + download_url = config_dict.get("download_url") + if download_url: + content = read_url(download_url) + else: + raise KeyError("config_dict must have a 'content' field or a 'download_url'") + return content.strip() + + def get_lean4_version_from_config(toolchain: str) -> str: """Return the required Lean version given a ``lean-toolchain`` config.""" m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) @@ -391,8 +403,7 @@ def get_lean4_version_from_config(toolchain: str) -> str: def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: """Return the required Lean commit given a ``lean-toolchain`` config.""" - assert "content" in config_dict, "config_dict must have a 'content' field" - config = config_dict["content"].strip() + config = _read_toolchain_content(config_dict) prefix = "leanprover/lean4:" if config == f"{prefix}nightly": @@ -404,8 +415,39 @@ def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: if version.startswith("nightly"): return _to_commit_hash_compat(LEAN4_NIGHTLY_REPO, version) - else: - return _to_commit_hash_compat(LEAN4_REPO, version) + + def _try_labels(labels: List[str]): + for label in labels: + try: + return _to_commit_hash_compat(LEAN4_REPO, label) + except ValueError: + continue + raise ValueError + + labels_to_try = [version] + if not version.startswith("v"): + labels_to_try.append(f"v{version}") + + try: + return _try_labels(labels_to_try) + except ValueError: + tags = LEAN4_REPO.get_tags() + for tag in tags: + if tag.name == version or tag.name == f"v{version}": + logger.warning( + f"Falling back to Lean tag {tag.name} for toolchain version {version}." + ) + return tag.commit.sha + if version in tag.name: + logger.warning( + f"Approximating Lean toolchain {version} with tag {tag.name}." + ) + return tag.commit.sha + + logger.warning( + f"Unable to resolve Lean toolchain {version}; falling back to latest commit of leanprover/lean4." 
+ ) + return LEAN4_REPO.get_commits()[0].sha URL = TAG = COMMIT = str @@ -504,7 +546,7 @@ def __post_init__(self) -> None: else: config = self.get_config("lean-toolchain") lean_version = get_lean4_commit_from_config(config) - v = get_lean4_version_from_config(config["content"]) + v = get_lean4_version_from_config(_read_toolchain_content(config)) if not is_supported_version(v): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" From 2cf489ab4a9a27b4d08f4f24afbf1206e8060e63 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 18 Nov 2025 09:24:20 -0500 Subject: [PATCH 26/29] Update tracing workflow (Docker + build_deps) --- .dockerignore | 8 + Dockerfile.arm | 16 ++ leanagent.py | 38 +++-- scripts/trace_paper_repos.py | 157 ++++++++---------- workspace/build_lean4_repo.py | 293 ++++++++++++++++++++++++++++++++++ 5 files changed, 404 insertions(+), 108 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile.arm create mode 100644 workspace/build_lean4_repo.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8ccc9f2 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +RAID +workspace +**/.lake +**/__pycache__ +**/*.olean +**/*.log +.git +.venv diff --git a/Dockerfile.arm b/Dockerfile.arm new file mode 100644 index 0000000..f9178ed --- /dev/null +++ b/Dockerfile.arm @@ -0,0 +1,16 @@ +FROM ubuntu:22.04 + +RUN apt-get update && apt-get install -y \ + curl git python3 python3-pip python3-venv build-essential \ + && rm -rf /var/lib/apt/lists/* + +RUN curl https://raw.githubusercontent.com/elan/elan/master/elan-init.sh -sSf \ + | sh -s -- -y --default-toolchain leanprover/lean4:v4.9.0 +ENV PATH="/root/.elan/bin:${PATH}" + +WORKDIR /workspace/LeanAgent +COPY . . + +RUN python3 -m venv .venv && \ + .venv/bin/pip install --upgrade pip && \ + .venv/bin/pip install -r requirements.txt diff --git a/leanagent.py b/leanagent.py index 8fb50a0..5b2afb8 100644 --- a/leanagent.py +++ b/leanagent.py @@ -112,6 +112,10 @@ def save_database_locked(db: DynamicDatabase, path: str) -> None: write_json_locked(path, db.to_dict(), ensure_ascii=False) +def append_text_locked(path: str, *chunks: str) -> None: + """Atomically append textual data while coordinating concurrent writers.""" + with _locked(path, "a") as handle: + handle.writelines(chunks) def _eval(data, preds_map) -> Tuple[float, float, float]: """Evaluates the retrieval model.""" R1 = [] @@ -892,11 +896,11 @@ def main(): os.path.join(dataset_path, d) for d in os.listdir(dataset_path) ] if is_main_process: - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write( - f"Results for {dir_name} with lambda = {lambda_value}" - ) + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Results for {dir_name} with lambda = {lambda_value}", + ) for data_path in testing_paths: if "merged" not in data_path: continue @@ -919,11 +923,13 @@ def main(): total_R1.append(R1) total_R10.append(R10) total_MRR.append(MRR) - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write(f"Intermediate results for {data_path}") - f.write("\n\n\n") - f.write(f"R@1 = {R1} %, R@10 = {R10} %, MRR = {MRR}") + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Intermediate results for {data_path}", + "\n\n\n", + f"R@1 = {R1} %, R@10 = {R10} %, MRR = {MRR}", + ) if is_main_process: avg_R1 = np.mean(total_R1) @@ -935,13 +941,13 @@ def main(): ) if not os.path.exists(EVAL_RESULTS_FILE_PATH): - open(EVAL_RESULTS_FILE_PATH, "w").close() + 
append_text_locked(EVAL_RESULTS_FILE_PATH, "") - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write( - f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" - ) + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}", + ) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" if result is None: diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index 8d33d0a..54de9b2 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -12,114 +12,87 @@ import os import json import pathlib +import sys -from lean_dojo import LeanGitRepo -from lean_dojo.data_extraction.trace import get_traced_repo_path +HERE = pathlib.Path(__file__).resolve() +REPO_ROOT = HERE.parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from lean_dojo import LeanGitRepo # noqa: E402 +from lean_dojo.data_extraction.trace import get_traced_repo_path # noqa: E402 # hardcoded list reconstructed from the paper / convo +# ==== Already traced ==== +# 1. teorth/pfr FAITHFUL +# 2. avigad/mathematics_in_lean_source +# 3. yangky11/miniF2F-lean4 +# 6. AlexKontorovich/PrimeNumberTheoremAnd +# 7. dwrensha/compfiles +# 8. ImperialCollegeLondon/FLT +# 9. verse-lab/veil +# 10. eric-wieser/lean-matrix-cookbook + +# ==== Heavy / needs fix ==== +# 4. lecopivo/SciLean (macOS SG_READ_ONLY crash) +# 11. loganrjmurphy/LeanEuclid (same) + +# ==== Remaining targets ==== +# PAPER_REPOS = [ +# { +# "owner": "dwrensha", +# "name": "compfiles", +# "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", +# }, +# { +# "owner": "avigad", +# "name": "mathematics_in_lean_source", +# "sha": "5297e0fb051367c48c0a084411853a576389ecf5", +# }, +# { +# "owner": "yangky11", +# "name": "miniF2F-lean4", +# "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", +# }, +# { +# "owner": "teorth", +# "name": "pfr", +# "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", +# }, +# { +# "owner": "ImperialCollegeLondon", +# "name": "FLT", +# "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", +# }, +# { +# "owner": "verse-lab", +# "name": "veil", +# "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", +# }, +# ] + PAPER_REPOS = [ - # 1. teorth/pfr - # { - # "owner": "teorth", - # "name": "pfr", - # "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", - # }, - # 2. avigad/mathematics_in_lean_source - { - "owner": "avigad", - "name": "mathematics_in_lean_source", - "sha": "5297e0fb051367c48c0a084411853a576389ecf5", - }, - { - "owner": "verse-lab", - "name": "veil", - "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", - }, - # 3. miniF2F - { - "owner": "yangky11", - "name": "miniF2F-lean4", - "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", - }, - # 4. SciLean (in paper → we must make it work eventually) - # { - # "owner": "lecopivo", - # "name": "SciLean", - # "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", - # }, - # 5. teorth/lean4-pdl - { - "owner": "teorth", - "name": "lean4-pdl", - "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", - }, - # 6. prime number theorem notes - { - "owner": "AlexKontorovich", - "name": "PrimeNumberTheoremAnd", - "sha": "29baddd685660b5fedd7bd67f9916ae24253d566", - }, - # 7. compfiles - { - "owner": "dwrensha", - "name": "compfiles", - "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", - }, - # 8. 
FLT - { - "owner": "ImperialCollegeLondon", - "name": "FLT", - "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", - }, - { - "owner": "verse-lab", - "name": "veil", - "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", - }, - # 9. lean4-cli (paper mentions tooling repos; we saw this in your crawl) - { - "owner": "leanprover-community", - "name": "lean4-cli", - "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", - }, - # 10. matrix cookbook { - "owner": "eric-wieser", - "name": "lean-matrix-cookbook", - "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f", + "owner": "lecopivo", + "name": "SciLean", + "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", }, - # 11. LeanEuclid { "owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", }, - # 12. formalized logic foundation { "owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", }, - # 13. con-nf - { - "owner": "pengbaolin", - "name": "con-nf", - "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", - }, - # 14. zeta_3_irrational - { - "owner": "ahhwuhu", - "name": "zeta_3_irrational", - "sha": "914712200e463cfc97fe37e929d518dd58806a38", - }, - # 15. LeanAPAP - { - "owner": "judicael-pvt", - "name": "LeanAPAP", - "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b", - }, - # paper had a few that we couldn't map to GH — keep extensible +# { +# "owner": "TODO", +# "name": "lean4lean", +# "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", +# }, ] @@ -166,7 +139,7 @@ def main() -> None: print(f"\n=== tracing {url}@{commit} ===") try: repo = LeanGitRepo(url, commit) - traced_path = get_traced_repo_path(repo, build_deps=False) + traced_path = get_traced_repo_path(repo, build_deps=True) traced_path = pathlib.Path(traced_path) print(f" lean_dojo traced into cache: {traced_path}") except Exception as e: diff --git a/workspace/build_lean4_repo.py b/workspace/build_lean4_repo.py new file mode 100644 index 0000000..5655a2d --- /dev/null +++ b/workspace/build_lean4_repo.py @@ -0,0 +1,293 @@ +"""Build Lean 4 projects in Docker. + +Only this file runs in Docker. So it must be self-contained. +""" + +import os +import sys +import re +import shutil +import argparse +import itertools +import subprocess +from tqdm import tqdm +from loguru import logger +from time import sleep, monotonic +from pathlib import Path, PurePath +from multiprocessing import Process +from contextlib import contextmanager +from typing import Union, List, Optional, Generator + + +def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optional[str]: + """Run a shell command. + + Args: + cmd (Union[str, List[str]]): A command or a list of commands. 
+ """ + if isinstance(cmd, list): + cmd = " && ".join(cmd) + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + if capture_output: + return res.stdout.decode() + else: + return None + + +def is_macos() -> bool: + return sys.platform == "darwin" + + +def _patch_dylib(path: Path) -> None: + """Adjust __DATA_CONST flags so macOS 15 accepts the library.""" + try: + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segprot", + "__DATA_CONST", + "r--", + "rw-", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segflags", + "__DATA_CONST", + "0x4", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + ["codesign", "--force", "--sign", "-", str(path)], + check=True, + capture_output=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as ex: + logger.warning(f"Failed to patch {path}: {ex}") + + +def patch_dylibs(root: Path) -> None: + if not is_macos(): + return + dylibs = list(root.rglob("*.dylib")) + if not dylibs: + return + logger.debug(f"Patching {len(dylibs)} dylibs under {root}") + for dylib in dylibs: + _patch_dylib(dylib) + + +def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: + """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. + + Args: + dir (Path): The directory containing Lean files. + """ + dir = Path(dir) + + for p in dir.glob("**/*.lean"): + with p.with_suffix(".dep_paths").open("wt") as oup: + for line in run_cmd( + f"{lean_bin} --deps {p}", capture_output=True + ).splitlines(): + olean_path = PurePath(line.strip()) + assert olean_path.suffix == ".olean" + lean_path = olean_path.relative_to(root).with_suffix(".lean") + oup.write(str(lean_path) + "\n") + + +def remove_files(dir: Path, suffix: str) -> None: + """Remove all files in ``dir`` that end with ``suffix``.""" + for p in Path(dir).glob(f"**/*{suffix}"): + p.unlink() + + +_PROGRESSBAR_UPDATE_INTERNAL = 5 + + +def _monitor(paths: List[Path], num_total: int) -> None: + with tqdm(total=num_total) as pbar: + while True: + time_start = monotonic() + try: + num_done = len( + list( + itertools.chain.from_iterable( + p.glob(f"**/*.ast.json") for p in paths + ) + ) + ) + except Exception: + continue + time_elapsed = monotonic() - time_start + if time_elapsed < _PROGRESSBAR_UPDATE_INTERNAL: + sleep(_PROGRESSBAR_UPDATE_INTERNAL - time_elapsed) + pbar.update(num_done - pbar.n) + if num_done >= num_total: + break + print("") + + +@contextmanager +def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, None]: + """Launch an async progressbar to monitor the progress of tracing the repo.""" + paths = [Path(p) for p in paths] + olean_files = list( + itertools.chain.from_iterable(p.glob("**/*.olean") for p in paths) + ) + num_total = len(olean_files) + p = Process(target=_monitor, args=(paths, num_total), daemon=True) + p.start() + try: + yield + finally: + p.join(timeout=1) + if p.is_alive(): + p.terminate() + + +def get_lean_version() -> str: + """Get the version of Lean.""" + output = run_cmd("lean --version", capture_output=True).strip() + m = re.match(r"Lean \(version (?P\S+?),", output) + return m["version"] + + +def check_files(packages_path: str, no_deps: bool) -> bool: + """Check if all *.lean files have been processed to produce *.ast.json and *.dep_paths files.""" + cwd = Path.cwd() + packages_path = cwd / packages_path + jsons = { + p.with_suffix("").with_suffix("") + for p in cwd.glob("**/build/ir/**/*.ast.json") + if not 
no_deps or not p.is_relative_to(packages_path) + } + deps = { + p.with_suffix("") + for p in cwd.glob("**/build/ir/**/*.dep_paths") + if not no_deps or not p.is_relative_to(packages_path) + } + oleans = { + Path(str(p.with_suffix("")).replace("/build/lib/", "/build/ir/")) + for p in cwd.glob("**/build/lib/**/*.olean") + if not no_deps or not p.is_relative_to(packages_path) + } + assert len(jsons) <= len(oleans) and len(deps) <= len(oleans) + missing_jsons = {p.with_suffix(".ast.json") for p in oleans - jsons} + missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} + if len(missing_jsons) > 0 or len(missing_deps) > 0: + for p in missing_jsons.union(missing_deps): + logger.warning(f"Missing {p}") + return False + return True + + +def is_new_version(v: str) -> bool: + """Check if ``v`` is at least `4.3.0-rc2`.""" + major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] + if major < 4 or (major == 4 and minor < 3): + return False + if ( + major > 4 + or (major == 4 and minor > 3) + or (major == 4 and minor == 3 and patch > 0) + ): + return True + assert major == 4 and minor == 3 and patch == 0 + if "4.3.0-rc" in v: + rc = int(v.split("-")[1][2:]) + return rc >= 2 + else: + return True + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("repo_name") + parser.add_argument("--no-deps", action="store_true") + args = parser.parse_args() + + num_procs = int(os.environ["NUM_PROCS"]) + repo_name = args.repo_name + os.chdir(repo_name) + + extractor_src = Path(__file__).with_name("ExtractData.lean").resolve() + extractor_dst = Path("ExtractData.lean") + shutil.copy2(extractor_src, extractor_dst) + + lean_version = get_lean_version() + use_new_layout = is_new_version(lean_version) + if use_new_layout: + packages_path = ".lake/packages" + build_path = ".lake/build" + else: + packages_path = "lake-packages" + build_path = "build" + + # If the lean4 package exists, we assume the build has completed and we just need to trace + if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): + logger.info(f"The repo {repo_name} has already been built, but has not been traced.") + else: + # Build the repo using lake. + logger.info(f"Building {repo_name}") + if args.no_deps: + # The additional *.olean files wouldn't matter. + try: + run_cmd("lake exe cache get") + except subprocess.CalledProcessError: + pass + + # Try building; on macOS, if the build fails due to SG_READ_ONLY, patch dylibs and retry once. + try: + run_cmd("lake build") + except subprocess.CalledProcessError as e: + if is_macos(): + logger.warning("lake build failed; patching dylibs for macOS and retrying once") + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + run_cmd("lake build") + else: + raise + + # Ensure final artifacts are patched as well. + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + + # Copy the Lean 4 stdlib into the path of packages. + lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() + shutil.copytree(lean_prefix, f"{packages_path}/lean4") + + + # Run ExtractData.lean to extract ASTs, tactic states, and premise information. 
+ dirs_to_monitor = [build_path] + if not args.no_deps: + dirs_to_monitor.append(packages_path) + + logger.info(f"Tracing {repo_name}") + try: + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + finally: + if extractor_dst.exists(): + extractor_dst.unlink() + + assert check_files(packages_path, args.no_deps), "Some files failed to be processed." + + +if __name__ == "__main__": + main() From ec5e4edd35295e3083c86e0b5fe9b8e24fb18646 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Fri, 21 Nov 2025 01:02:53 -0500 Subject: [PATCH 27/29] Trace more paper repos --- scripts/trace_paper_repos.py | 180 ++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 76 deletions(-) diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index 54de9b2..da43365 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -9,9 +9,11 @@ python scripts/trace_paper_repos.py """ -import os import json +import os import pathlib +import shutil +import subprocess import sys HERE = pathlib.Path(__file__).resolve() @@ -24,78 +26,102 @@ # hardcoded list reconstructed from the paper / convo -# ==== Already traced ==== -# 1. teorth/pfr FAITHFUL -# 2. avigad/mathematics_in_lean_source -# 3. yangky11/miniF2F-lean4 -# 6. AlexKontorovich/PrimeNumberTheoremAnd -# 7. dwrensha/compfiles -# 8. ImperialCollegeLondon/FLT -# 9. verse-lab/veil -# 10. eric-wieser/lean-matrix-cookbook - -# ==== Heavy / needs fix ==== -# 4. lecopivo/SciLean (macOS SG_READ_ONLY crash) -# 11. loganrjmurphy/LeanEuclid (same) - -# ==== Remaining targets ==== -# PAPER_REPOS = [ -# { -# "owner": "dwrensha", -# "name": "compfiles", -# "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", -# }, -# { -# "owner": "avigad", -# "name": "mathematics_in_lean_source", -# "sha": "5297e0fb051367c48c0a084411853a576389ecf5", -# }, -# { -# "owner": "yangky11", -# "name": "miniF2F-lean4", -# "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", -# }, -# { -# "owner": "teorth", -# "name": "pfr", -# "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", -# }, -# { -# "owner": "ImperialCollegeLondon", -# "name": "FLT", -# "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", -# }, -# { -# "owner": "verse-lab", -# "name": "veil", -# "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", -# }, -# ] - +# ==== Already traced / paper list ==== PAPER_REPOS = [ - { - "owner": "lecopivo", - "name": "SciLean", - "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", - }, - { - "owner": "loganrjmurphy", - "name": "LeanEuclid", - "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", - }, - { - "owner": "FormalizedFormalLogic", - "name": "Foundation", - "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", - }, -# { -# "owner": "TODO", -# "name": "lean4lean", -# "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", -# }, + # {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, # ALREADY TRACED + # {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, # not found + # {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, # not found + # {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, # ALREADY TRACED + {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, # searched commit 
hashes; original table SHA/owner invalid + # {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, # ALREADY TRACED + # {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, # ALREADY TRACED + {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, # searched commit hashes; original table SHA/owner invalid + {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, # searched commit hashes; original table SHA/owner invalid + # {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, # ALREADY TRACED + # {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, # ALREADY TRACED + # {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, # ALREADY TRACED + # {"owner": "Bachmann", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, # not found + # {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, # ALREADY TRACED + # {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, # ALREADY TRACED + # {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, # ALREADY TRACED + # {"owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, SMT ERROR + # {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, SMT ERROR + # {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, # ALREADY TRACED + # {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, # ALREADY TRACED + # {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, # ALREADY TRACED + # {"owner": "EmilGedda", "name": "Formalization-of-Constructable-Numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, # not found + {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, # searched commit hashes; original table SHA/owner invalid ] +PINNED_DEPS = { + "loganrjmurphy/LeanEuclid": [ + { + "url": "https://github.com/leanprover-community/mathlib4", + "type": "git", + "subDir": None, + "rev": "b2c9f64fbc8dfe4c1b15b2bc6ab5a6f472fc047e", + "name": "mathlib", + "manifestFile": "lake-manifest.json", + "inputRev": "b2c9f64fbc8dfe4c1b15b2bc6ab5a6f472fc047e", + "inherited": False, + "configFile": "lakefile.lean", + }, + { + "url": "https://github.com/yangky11/lean-smt.git", + "type": "git", + "subDir": None, + "rev": "a3c0e8ab1e07d74b8fd745e7b3c4b83c6d859bbb", + "name": "smt", + "manifestFile": "lake-manifest.json", + "inputRev": "a3c0e8ab1e07d74b8fd745e7b3c4b83c6d859bbb", + "inherited": False, + "configFile": "lakefile.lean", + }, + ] +} + + +def apply_dependency_pins(repo_root: pathlib.Path, owner: str, name: str) -> None: + repo_key = f"{owner}/{name}" + targets = PINNED_DEPS.get(repo_key) + if not targets: + return + + manifest_path = repo_root / "lake-manifest.json" + if manifest_path.exists(): + try: + manifest = json.load(manifest_path.open()) + except json.JSONDecodeError: + manifest = {} + else: + manifest = {} + + manifest.setdefault("packagesDir", ".lake/packages") + packages = manifest.setdefault("packages", []) + 
changed = False + + for target in targets: + for pkg in packages: + if pkg.get("name") == target["name"]: + if pkg != target: + pkg.update(target) + changed = True + break + else: + packages.append(dict(target)) + changed = True + + if changed: + manifest_path.write_text(json.dumps(manifest, indent=2)) + print(f" pinned lake-manifest.json for {repo_key}") + + lake_dir = repo_root / ".lake" + if lake_dir.exists(): + shutil.rmtree(lake_dir) + print(f" removed stale .lake directory for {repo_key}") + + def make_corpus_from_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> int: """Scan .lake/build/ir for *.ast.json and write corpus.jsonl.""" ir_root = source_root / ".lake" / "build" / "ir" @@ -136,6 +162,15 @@ def main() -> None: url = f"https://github.com/{item['owner']}/{item['name']}" commit = item["sha"] + # repo as checked out by the earlier crawl + repo_root = repo_dir / item["owner"] / item["name"] + out_dir = raid_dir / "data" / f"{item['name']}_{commit}" + + if repo_root.exists(): + apply_dependency_pins(repo_root, item["owner"], item["name"]) + else: + print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") + print(f"\n=== tracing {url}@{commit} ===") try: repo = LeanGitRepo(url, commit) @@ -146,13 +181,6 @@ def main() -> None: print(f" !! lean_dojo failed for {url}@{commit}: {e}") continue - # repo as checked out by the earlier crawl - repo_root = repo_dir / item["owner"] / item["name"] - out_dir = raid_dir / "data" / f"{item['name']}_{commit}" - - if not repo_root.exists(): - print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") - sources = [traced_path] if repo_root.exists(): sources.append(repo_root) From 242778ac288bc2ab0bdeb79140f4cbd25bfe67d9 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Mon, 8 Dec 2025 20:18:52 -0500 Subject: [PATCH 28/29] Add training scripts and fix merged dataset build --- generate_benchmark_lean4.py | 2 +- scripts/build_merged_dataset.py | 98 +++++++++++++++- scripts/process_local_traces.py | 130 +++++++++++++++++++++ scripts/setup_vm.sh | 70 +++++++++++ scripts/trace_paper_repos.py | 46 ++++---- train_leanagent.py | 199 ++++++++++++++++++++++++++++++++ 6 files changed, 515 insertions(+), 30 deletions(-) create mode 100644 scripts/process_local_traces.py create mode 100644 scripts/setup_vm.sh create mode 100644 train_leanagent.py diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 7fed1d2..9d20504 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -233,7 +233,7 @@ def export_proofs( total_theorems = 0 for strategy, split in splits.items(): split_dir = dst_path / strategy - split_dir.mkdir(parents=True) + split_dir.mkdir(parents=True, exist_ok=True) for name, theorems in split.items(): data = [] diff --git a/scripts/build_merged_dataset.py b/scripts/build_merged_dataset.py index c1879f4..075ce7a 100644 --- a/scripts/build_merged_dataset.py +++ b/scripts/build_merged_dataset.py @@ -49,6 +49,9 @@ def iter_nonempty_corpora(data_root: Path): item["name"]: f"https://github.com/{item['owner']}/{item['name']}" for item in PAPER_REPOS } +# Add alternative names for repos with different directory names +_SLUG_TO_URL["formal_book"] = "https://github.com/mo271/FormalBook" +_SLUG_TO_URL["hairy-ball-theorem-lean"] = "https://github.com/leanprover-community/hairy-ball-theorem" def _infer_repo_from_dir(dir_path: Path) -> tuple[str, str]: @@ -95,21 +98,104 @@ def main() -> None: # Add repos discovered from existing corpora data_root = 
raid_dir / "data" + if not data_root.exists(): + logger.warning(f"{data_root} does not exist. Checking {raid_dir} directly...") + data_root = raid_dir + targets = [] for d, cj in iter_nonempty_corpora(data_root): try: + # We don't strictly need url/commit here if we trust metadata.json, + # but it's good for logging. url, commit = load_repo_from_corpus(cj) - targets.append((url, commit)) + targets.append((d, cj, url, commit)) except Exception as e: logger.warning(f"Skipping {d} due to: {e}") logger.info(f"Found {len(targets)} repos with non-empty corpora to ingest") - for url, commit in targets: - repo = LeanGitRepo(url, commit) - logger.info(f"Ingesting {url}@{commit}") - status = add_repo_to_database(str(db_path), repo, db) - logger.info(f"Status for {url}: {status}") + from dynamic_database import Repository + import requests + import re + + def get_lean_version_from_github(url, commit): + """Fetch lean-toolchain from GitHub and parse version.""" + try: + raw_url = url.replace("github.com", "raw.githubusercontent.com") + config_url = f"{raw_url}/{commit}/lean-toolchain" + response = requests.get(config_url, timeout=10) + if response.status_code == 200: + content = response.text.strip() + # Parse version like "leanprover/lean4:v4.8.0" -> "v4.8.0" + match = re.search(r"leanprover/lean4:(.+)", content) + if match: + return match.group(1) + return content # Fallback to full string if regex fails + except Exception as e: + logger.warning(f"Failed to fetch lean-toolchain for {url}@{commit}: {e}") + return "v4.0.0" # Ultimate fallback + + for d, cj, url, commit in targets: + logger.info(f"Ingesting {url}@{commit} from {d}") + + # 1. Read metadata.json + meta_path = d / "metadata.json" + meta = {} + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + meta = json.load(f) + except Exception as e: + logger.error(f"Failed to read metadata for {d}: {e}") + + # 2. Construct Repository data + from_repo = meta.get("from_repo", {}) + repo_url = from_repo.get("url", url) + repo_commit = from_repo.get("commit", commit) + + date_processed = meta.get("date_processed") + if not date_processed: + date_processed = meta.get("creation_time", "2024-01-01T00:00:00.000000") + + lean_dojo_ver = meta.get("lean_dojo_version") or meta.get("leandojo_version", "0.0.1") + + # Fetch REAL Lean version if missing + lean_ver = meta.get("lean_version") + if not lean_ver: + logger.info(f"Fetching real Lean version for {repo_url}...") + lean_ver = get_lean_version_from_github(repo_url, repo_commit) + logger.info(f"Got version: {lean_ver}") + + # Create dummy theorems folder if missing + theorems_dir = d / "random" + if not theorems_dir.exists(): + theorems_dir.mkdir(parents=True, exist_ok=True) + + repo_data = { + "url": repo_url, + "name": repo_url.split("/")[-1] if repo_url else d.name.split("_")[0], + "commit": repo_commit, + "lean_version": lean_ver, + "lean_dojo_version": lean_dojo_ver, + "metadata": { + "date_processed": date_processed + }, + "theorems_folder": str(theorems_dir), + "premise_files_corpus": str(cj), + "files_traced": str(d / "traced_files.jsonl") + } + + # 3. 
Add to DB + try: + repo = Repository.from_dict(repo_data) + db.add_repository(repo) + logger.info(f"Successfully added {repo_url} to DB") + except Exception as e: + logger.error(f"Failed to add repo {repo_url} to DB: {e}") + + # Save updated database + logger.info(f"Saving database with {len(db.repositories)} repositories to {db_path}") + db.to_json(str(db_path)) # Export merged dataset out_dir = raid_dir / "data" / "merged_paper_subset" diff --git a/scripts/process_local_traces.py b/scripts/process_local_traces.py new file mode 100644 index 0000000..4be7717 --- /dev/null +++ b/scripts/process_local_traces.py @@ -0,0 +1,130 @@ +import json +import pathlib +import os +import shutil +import sys + +# Add the parent directory (LeanAgent root) to sys.path to allow imports +current_dir = pathlib.Path(__file__).parent.resolve() +lean_agent_root = current_dir.parent +sys.path.insert(0, str(lean_agent_root)) + +from lean_dojo.data_extraction.traced_data import TracedRepo +from generate_benchmark_lean4 import export_premises, export_proofs, split_data, export_metadata + +def process_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> bool: + """ + Load a traced Lean repository from source_root, + export premises to corpus.jsonl, and export theorems to random/. + """ + print(f" Loading TracedRepo from {source_root}...") + try: + traced_repo = TracedRepo.from_traced_files(source_root, build_deps=True) + except Exception as e: + print(f" !! Failed to load TracedRepo: {e}") + return False + + print(f" Exporting to {out_dir}...") + out_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Export premises to corpus.jsonl + try: + export_premises(traced_repo, out_dir) + print(f" ✓ Exported premises → {out_dir / 'corpus.jsonl'}") + except Exception as e: + print(f" !! Failed to export premises: {e}") + return False + + # Step 2: Export theorems (THE FIX - this was missing!) + try: + print(f" Extracting theorems...") + splits = split_data(traced_repo, num_val_pct=0.02, num_test_pct=0.02) + total_theorems = export_proofs(splits, out_dir, traced_repo) + export_metadata(traced_repo, out_dir) # Fixed: removed splits argument + print(f" ✓ Exported {total_theorems} theorems → {out_dir / 'random/'}") + except Exception as e: + print(f" !! 
Failed to export theorems: {e}") + import traceback + traceback.print_exc() + return False + + return True + +# VM repo paths - comment out repos you want to skip +VM_REPOS = [ + {"name": "FLT", "owner": "ImperialCollegeLondon", "commit": "b208a302cdcbfadce33d8165f0b054bfa17e2147", "local_path": "/home/aum/repos_cache/FLT/ImperialCollegeLondon-FLT-b208a302cdcbfadce33d8165f0b054bfa17e2147/FLT"}, + {"name": "formal_book", "owner": "mo271", "commit": "6fbe8c2985008c0bfb30050750a71b90388ad3a3", "local_path": "/home/aum/repos_cache/FormalBook/mo271-formal_book-6fbe8c2985008c0bfb30050750a71b90388ad3a3/formal_book"}, + {"name": "Formalisation-of-constructable-numbers", "owner": "Louis", "commit": "01ef1f22a04f2ba8081c5fb29413f515a0e52878", "local_path": "/home/aum/repos_cache/Formalization-of-Constructable-Numbers/Louis-Le-Grand-Formalisation-of-constructable-numbers-01ef1f22a04f2ba8081c5fb29413f515a0e52878/Formalisation-of-constructable-numbers"}, + {"name": "Foundation", "owner": "FormalizedFormalLogic", "commit": "d5fe5d057a90a0703a745cdc318a1b6621490c21", "local_path": "/home/aum/repos_cache/Foundation/FormalizedFormalLogic-Foundation-d5fe5d057a90a0703a745cdc318a1b6621490c21/Foundation"}, + {"name": "LeanAPAP", "owner": "YaelDillies", "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b", "local_path": "/home/aum/repos_cache/LeanAPAP/YaelDillies-LeanAPAP-951c660a8d7ba8e39f906fdf657674a984effa8b/LeanAPAP"}, + {"name": "LeanEuclid", "owner": "loganrjmurphy", "commit": "f1912c3090eb82820575758efc31e40b9db86bb8", "local_path": "/home/aum/repos_cache/LeanEuclid/loganrjmurphy-LeanEuclid-f1912c3090eb82820575758efc31e40b9db86bb8/LeanEuclid"}, + {"name": "PrimeNumberTheoremAnd", "owner": "AlexKontorovich", "commit": "29baddd685660b5fedd7bd67f9916ae24253d566", "local_path": "/home/aum/repos_cache/PrimeNumberTheoremAnd/AlexKontorovich-PrimeNumberTheoremAnd-29baddd685660b5fedd7bd67f9916ae24253d566/PrimeNumberTheoremAnd"}, + {"name": "Saturn", "owner": "siddhartha", "commit": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a", "local_path": "/home/aum/repos_cache/Saturn/siddhartha-gadgil-Saturn-3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a/Saturn"}, + {"name": "SciLean", "owner": "lecopivo", "commit": "22d53b2f4e3db2a172e71da6eb9c916e62655744", "local_path": "/home/aum/repos_cache/SciLean/lecopivo-SciLean-22d53b2f4e3db2a172e71da6eb9c916e62655744/SciLean"}, + {"name": "carleson", "owner": "fpvandoorn", "commit": "bec7808b907190882fa1fa54ce749af297c6cf37", "local_path": "/home/aum/repos_cache/carleson/fpvandoorn-carleson-bec7808b907190882fa1fa54ce749af297c6cf37/carleson"}, + {"name": "compfiles", "owner": "dwrensha", "commit": "f99bf6f2928d47dd1a445b414b3a723c2665f091", "local_path": "/home/aum/repos_cache/compfiles/dwrensha-compfiles-f99bf6f2928d47dd1a445b414b3a723c2665f091/compfiles"}, + {"name": "con-nf", "owner": "leanprover", "commit": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", "local_path": "/home/aum/repos_cache/con-nf/leanprover-community-con-nf-00bdc85ba7d486a9e544a0806a1018dd06fa3856/con-nf"}, + {"name": "coxeter", "owner": "NUS", "commit": "96af8aee7943ca8685ed1b00cc83a559ea389a97", "local_path": "/home/aum/repos_cache/coxeter/NUS-Math-Formalization-coxeter-96af8aee7943ca8685ed1b00cc83a559ea389a97/coxeter"}, + {"name": "hairy-ball-theorem-lean", "owner": "corent1234", "commit": "a778826d19c8a7ddf1d26beeea628c45450612e6", "local_path": "/home/aum/repos_cache/hairy-ball-theorem/corent1234-hairy-ball-theorem-lean-a778826d19c8a7ddf1d26beeea628c45450612e6/hairy-ball-theorem-lean"}, + {"name": "lean-math-workshop", 
"owner": "yuma", "commit": "5acd4b933d47fd6c1032798a6046c1baf261445d", "local_path": "/home/aum/repos_cache/lean-math-workshop/yuma-mizuno-lean-math-workshop-5acd4b933d47fd6c1032798a6046c1baf261445d/lean-math-workshop"}, + {"name": "lean-matrix-cookbook", "owner": "eric", "commit": "f15a149d321ac99ff9b9c024b58e7882f564669f", "local_path": "/home/aum/repos_cache/lean-matrix-cookbook/eric-wieser-lean-matrix-cookbook-f15a149d321ac99ff9b9c024b58e7882f564669f/lean-matrix-cookbook"}, + {"name": "lean4-pdl", "owner": "m4lvin", "commit": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", "local_path": "/home/aum/repos_cache/lean4-pdl/m4lvin-lean4-pdl-c7f649fe3c4891cf1a01c120e82ebc5f6199856e/lean4-pdl"}, + {"name": "lean4lean", "owner": "digama0", "commit": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", "local_path": "/home/aum/repos_cache/lean4lean/digama0-lean4lean-05b1f4a68c5facea96a5ee51c6a56fef21276e0f/lean4lean"}, + {"name": "mathematics_in_lean_source", "owner": "avigad", "commit": "5297e0fb051367c48c0a084411853a576389ecf5", "local_path": "/home/aum/repos_cache/mathematics_in_lean_source/avigad-mathematics_in_lean_source-5297e0fb051367c48c0a084411853a576389ecf5/mathematics_in_lean_source"}, + {"name": "miniF2F-lean4", "owner": "yangky11", "commit": "9e445f5435407f014b88b44a98436d50dd7abd00", "local_path": "/home/aum/repos_cache/miniF2F-lean4/yangky11-miniF2F-lean4-9e445f5435407f014b88b44a98436d50dd7abd00/miniF2F-lean4"}, + {"name": "pfr", "owner": "teorth", "commit": "fa398a5b853c7e94e3294c45e50c6aee013a2687", "local_path": "/home/aum/repos_cache/pfr/teorth-pfr-fa398a5b853c7e94e3294c45e50c6aee013a2687/pfr"}, + {"name": "zeta_3_irrational", "owner": "ahhwuhu", "commit": "914712200e463cfc97fe37e929d518dd58806a38", "local_path": "/home/aum/repos_cache/zeta_3_irrational/ahhwuhu-zeta_3_irrational-914712200e463cfc97fe37e929d518dd58806a38/zeta_3_irrational"}, +] + +def main(): + # Configuration for VM + raid_dir = pathlib.Path.home() / "LeanAgent" / "RAID" + + print("="*80) + print("VM BATCH EXTRACTION - Processing Local Traces (Sequential)") + print("="*80) + print(f"Found {len(VM_REPOS)} repos to process") + print("="*80) + + successful = [] + failed = [] + + for i, repo in enumerate(VM_REPOS, 1): + print(f"\n[{i}/{len(VM_REPOS)}] Processing {repo['name']}...") + source_root = pathlib.Path(repo["local_path"]) + + if not source_root.exists(): + print(f" !! 
Source path does not exist: {source_root}") + failed.append(repo['name']) + continue + + out_dir = raid_dir / "data" / f"{repo['name']}_{repo['commit']}" + url = f"https://github.com/{repo['owner']}/{repo['name']}" + + success = process_repo(source_root, out_dir, url, repo["commit"]) + + if success: + print(f" ✓ Successfully processed {repo['name']}.") + successful.append(repo['name']) + else: + print(f" ✗ Failed to process {repo['name']}.") + failed.append(repo['name']) + + # Summary + print("\n" + "="*80) + print("EXTRACTION SUMMARY") + print("="*80) + print(f"\n✓ Successful: {len(successful)}/{len(VM_REPOS)}") + for name in successful: + print(f" - {name}") + + if failed: + print(f"\n✗ Failed: {len(failed)}/{len(VM_REPOS)}") + for name in failed: + print(f" - {name}") + + print(f"\nResults saved to: {raid_dir / 'data'}") + print("="*80) + +if __name__ == "__main__": + main() diff --git a/scripts/setup_vm.sh b/scripts/setup_vm.sh new file mode 100644 index 0000000..ea67cce --- /dev/null +++ b/scripts/setup_vm.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# VM Setup Script - Run this on the GCP VM +# Installs all dependencies for LeanAgent extraction + +set -e + +echo "=========================================" +echo "Setting up VM for Lean Theorem Extraction" +echo "=========================================" + +# Update system +echo "Step 1/8: Updating system packages..." +sudo apt-get update -y +sudo apt-get upgrade -y + +# Install Python 3.10 +echo "Step 2/8: Installing Python 3.10..." +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:deadsnakes/ppa +sudo apt-get update -y +sudo apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip + +# Install system dependencies +echo "Step 3/8: Installing system dependencies..." +sudo apt-get install -y git unzip wget curl build-essential + +# Install gdown for Google Drive downloads +echo "Step 4/8: Installing gdown..." +pip3 install gdown + +# Clone LeanAgent repo +echo "Step 5/8: Cloning LeanAgent repository..." +cd ~ +git clone https://github.com/lean-dojo/LeanAgent.git +cd LeanAgent + +# Create virtual environment +echo "Step 6/8: Creating Python virtual environment..." +python3.10 -m venv venv +source venv/bin/activate + +# Install LeanAgent with all dependencies +echo "Step 7/8: Installing LeanAgent and dependencies..." +pip install --upgrade pip +pip install -e . + +# Create directories and set environment variables +echo "Step 8/8: Setting up directories and environment..." +mkdir -p ~/LeanAgent/RAID/data +mkdir -p ~/repos_cache + +# Set RAID_DIR environment variable +echo 'export RAID_DIR=~/LeanAgent/RAID' >> ~/.bashrc +export RAID_DIR=~/LeanAgent/RAID + +echo "" +echo "=========================================" +echo "✅ VM Setup Complete!" +echo "=========================================" +echo "" +echo "Environment configured:" +echo " - Python 3.10 installed" +echo " - LeanAgent installed with all dependencies" +echo " - RAID_DIR set to ~/LeanAgent/RAID" +echo " - Virtual environment at ~/LeanAgent/venv" +echo "" +echo "Next steps:" +echo " 1. Download repos from Google Drive" +echo " 2. 
Run extraction script" +echo "" diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index da43365..ba25aa9 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -28,29 +28,29 @@ # hardcoded list reconstructed from the paper / convo # ==== Already traced / paper list ==== PAPER_REPOS = [ - # {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, # ALREADY TRACED - # {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, # not found - # {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, # not found - # {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, # ALREADY TRACED - {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, # searched commit hashes; original table SHA/owner invalid - # {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, # ALREADY TRACED - # {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, # ALREADY TRACED - {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, # searched commit hashes; original table SHA/owner invalid - {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, # searched commit hashes; original table SHA/owner invalid - # {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, # ALREADY TRACED - # {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, # ALREADY TRACED - # {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, # ALREADY TRACED - # {"owner": "Bachmann", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, # not found - # {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, # ALREADY TRACED - # {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, # ALREADY TRACED - # {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, # ALREADY TRACED - # {"owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, SMT ERROR - # {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, SMT ERROR - # {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, # ALREADY TRACED - # {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, # ALREADY TRACED - # {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, # ALREADY TRACED - # {"owner": "EmilGedda", "name": "Formalization-of-Constructable-Numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, # not found - {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, # searched commit hashes; original table SHA/owner invalid + {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, + {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, + {"owner": "loganrjmurphy", "name": 
"LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, + {"owner": "Louis-Le-Grand", "name": "Formalisation-of-constructable-numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, + {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, + {"owner": "google-deepmind", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, + {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, + {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, + {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, + {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, + {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, + {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, + {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, + {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, + {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, + {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, + {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, + {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, + {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, + {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, + {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, + {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, + {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, ] diff --git a/train_leanagent.py b/train_leanagent.py new file mode 100644 index 0000000..f7cc050 --- /dev/null +++ b/train_leanagent.py @@ -0,0 +1,199 @@ +import os +import sys +import json +import traceback +import torch +import pytorch_lightning as pl +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint +from pytorch_lightning.strategies import DDPStrategy +from datetime import timedelta +from loguru import logger + +# Import from existing leanagent modules +from filenames import RAID_DIR, DATA_DIR, CHECKPOINT_DIR, DB_FILE_NAME +from dynamic_database import DynamicDatabase +from retrieval.datamodule import RetrievalDataModule +from retrieval.model import PremiseRetriever +import generate_benchmark_lean4 + +def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: + """Initializes or loads the dynamic database.""" + if not os.path.exists(dynamic_database_json_path): + raise FileNotFoundError(f"Database file not found at {dynamic_database_json_path}. Please run build_merged_dataset.py first.") + + logger.info(f"Loading database from {dynamic_database_json_path}") + db = DynamicDatabase.from_json(dynamic_database_json_path) + logger.info(f"Loaded database with {len(db.repositories)} repositories") + return db + +def main(): + """ + Simplified training script for LeanAgent. 
+ """ + try: + # Configuration + BATCH_SIZE = 4 + current_epoch = 0 + epochs_per_repo = 1 # We treat the merged dataset as one "repo" for epoch counting + lambda_value = 0.1 # For progressive training + + # Paths + if not RAID_DIR: + raise ValueError("RAID_DIR environment variable is not set.") + + dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) + # We use the merged dataset we created + # Note: build_merged_dataset.py created it at RAID_DIR/data/merged_paper_subset + new_data_path = os.path.join(DATA_DIR, "merged_paper_subset") + + if not os.path.exists(new_data_path): + raise FileNotFoundError(f"Merged dataset not found at {new_data_path}") + + # Setup + logger.info("Configuring LeanDojo...") + generate_benchmark_lean4.configure_leandojo() + logger.info("LeanDojo configured") + + db = initialize_database(dynamic_database_json_path) + + # Training Setup + logger.info("Starting Training Loop") + + # Find latest checkpoint or use default + model_checkpoint_path = None + try: + # Simple logic to find latest checkpoint + all_checkpoints = [os.path.join(CHECKPOINT_DIR, f) for f in os.listdir(CHECKPOINT_DIR) if f.endswith(".ckpt")] + if all_checkpoints: + model_checkpoint_path = max(all_checkpoints, key=os.path.getmtime) + logger.info(f"Found latest checkpoint: {model_checkpoint_path}") + except Exception as e: + logger.warning(f"Could not find existing checkpoints: {e}") + + if not model_checkpoint_path: + # Fallback to a base checkpoint if available, or let the model initialize from scratch/huggingface + # The original script defaults to a specific mathlib checkpoint. + # We will try to use that if it exists, otherwise None (which might fail if PremiseRetriever expects it) + default_ckpt = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" + if os.path.exists(default_ckpt): + model_checkpoint_path = default_ckpt + logger.info(f"Using default mathlib checkpoint: {model_checkpoint_path}") + else: + logger.warning("No checkpoint found. Training might start from scratch or fail if a base model is required.") + + seed_everything(3407) + + if not torch.cuda.is_available(): + logger.warning("CUDA is not available. 
Training will be extremely slow on CPU.") + device = torch.device("cpu") + else: + device = torch.device("cuda") + + config = { + "model_name": "kaiyuy/leandojo-lean4-retriever-byt5-small", + "lr": 1e-3, + "warmup_steps": 1000, + "max_seq_len": 512, + "num_retrieved": 100, + } + + # Load Model + if model_checkpoint_path: + model = PremiseRetriever.load(model_checkpoint_path, device, freeze=False, config=config) + logger.info(f"Loaded premise retriever from {model_checkpoint_path}") + else: + # If no checkpoint, initialize fresh model from HuggingFace + logger.info("Initializing new model from HuggingFace config...") + model = PremiseRetriever( + model_name=config["model_name"], + lr=config["lr"], + warmup_steps=config["warmup_steps"], + max_seq_len=config["max_seq_len"], + num_retrieved=config["num_retrieved"] + ) + + model.train() + model.set_lambda(lambda_value) + + # Callbacks + dir_name = "merged_paper_subset" + filename_suffix = f"_lambda_{lambda_value}" + + checkpoint_callback = ModelCheckpoint( + dirpath=CHECKPOINT_DIR, + filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", + verbose=True, + save_top_k=-1, + every_n_epochs=1, + monitor="Recall@10_val", + mode="max", + ) + + early_stop_callback = EarlyStopping( + monitor="Recall@10_val", patience=5, mode="max", verbose=True + ) + + lr_monitor = LearningRateMonitor(logging_interval="step") + + # Environment for DDP + VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 # 1 week + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) + + custom_log_dir = os.path.join(RAID_DIR, "lightning_logs", f"{dir_name}_lambda_{lambda_value}") + os.makedirs(custom_log_dir, exist_ok=True) + + # Trainer + # Adjust devices based on availability + num_gpus = torch.cuda.device_count() + devices = num_gpus if num_gpus > 0 else 1 + accelerator = "gpu" if num_gpus > 0 else "cpu" + strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) if num_gpus > 1 else "auto" + + trainer = pl.Trainer( + accelerator=accelerator, + gradient_clip_val=1.0, + precision="bf16-mixed" if num_gpus > 0 else 32, # bf16 might not work on CPU + strategy=strategy, + devices=devices, + accumulate_grad_batches=4, + callbacks=[lr_monitor, checkpoint_callback, early_stop_callback], + max_epochs=current_epoch + 5, # Train for 5 epochs for now + log_every_n_steps=1, + num_sanity_val_steps=0, + default_root_dir=custom_log_dir, + ) + + # Data Module + corpus_path = os.path.join(new_data_path, "corpus.jsonl") + data_path_random = os.path.join(new_data_path, "random") + + logger.info(f"Loading data from {data_path_random}") + data_module = RetrievalDataModule( + data_path=data_path_random, + corpus_path=corpus_path, + num_negatives=3, + num_in_file_negatives=1, + model_name="google/byt5-small", + batch_size=BATCH_SIZE, + eval_batch_size=64, + max_seq_len=1024, + num_workers=4, + ) + data_module.setup(stage="fit") + + logger.info(f"Training dataset size: {len(data_module.ds_train)}") + logger.info(f"Validation dataset size: {len(data_module.ds_val)}") + + # Train + logger.info("Starting trainer.fit...") + trainer.fit(model, datamodule=data_module, ckpt_path=model_checkpoint_path) + logger.info("Training finished!") + + except Exception as e: + logger.error(f"An error occurred: {e}") + traceback.print_exc() + +if __name__ == "__main__": + main() From 3f145c54b74a50b82b0a4a28bc817bb4f09fe079 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Mon, 5 Jan 2026 19:27:44 +0530 Subject: [PATCH 29/29] Fix lifelong learning bugs for 
23-repo reproduction --- leanagent.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 8 deletions(-) diff --git a/leanagent.py b/leanagent.py index 5b2afb8..8f9779d 100644 --- a/leanagent.py +++ b/leanagent.py @@ -624,10 +624,10 @@ def main(): current_epoch = 0 epochs_per_repo = 1 run_progressive_training = True - use_fisher = False - single_repo = True + use_fisher = True # FIXED: Enable EWC for lifelong learning + single_repo = False # FIXED: Enable cumulative learning across repos curriculum_learning = True - num_repos = 3 + num_repos = 23 # FIXED: Full paper reproduction (was 3) dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -644,6 +644,9 @@ def main(): logger.info("LeanDojo configured") db = initialize_database(dynamic_database_json_path) + # FIXED: Create required directories + os.makedirs(FISHER_DIR, exist_ok=True) + os.makedirs(CHECKPOINT_DIR, exist_ok=True) logger.info(f"Found {num_repos} repositories") lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) @@ -688,7 +691,8 @@ def main(): repos_for_proving = [] # Create a directory for the merged dataset if it doesn't exist - dst_dir = Path(RAID_DIR) / DATA_DIR / f"merged_with_new_{dir_name}" + # FIXED: Use DATA_DIR directly (it's already full path) + dst_dir = Path(DATA_DIR) / f"merged_with_new_{dir_name}" if (repo.url, repo.commit) not in repos_for_merged_dataset: logger.info("Adding repo to repos_for_merged_dataset") repos_for_merged_dataset.append((repo.url, repo.commit)) @@ -962,10 +966,53 @@ def main(): if ray.is_initialized(): logger.info("Shutting down Ray before proving") ray.shutdown() + + # ADDED: Compute Fisher Information Matrix after training (before proving) + if use_fisher and i < num_repos - 1: # Don't compute Fisher after last repo + logger.info("="*80) + logger.info("COMPUTING FISHER INFORMATION MATRIX") + logger.info("="*80) + + from retrieval.fisher_computation_module import FisherComputationModule + + # Create Fisher computation module with current best model + fisher_module = FisherComputationModule(best_model) + + # Setup trainer for Fisher computation + fisher_trainer = pl.Trainer( + accelerator="gpu", + precision="bf16-mixed", + strategy=ddp_strategy, + devices=4, + max_epochs=1, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + try: + logger.info("Computing Fisher matrix...") + fisher_trainer.strategy.barrier() + fisher_trainer.fit(fisher_module, datamodule=data_module) + fisher_trainer.strategy.barrier() + + # Save the Fisher Information Matrix + if fisher_trainer.is_global_zero: + fisher_file_path = os.path.join( + FISHER_DIR, + f"fisher_info_{dir_name}_distributed.pkl", + ) + fisher_module.save_fisher_info(fisher_file_path) + logger.info(f"Fisher Information Matrix saved at {fisher_file_path}") + except Exception as e: + logger.error(f"Error during Fisher computation: {str(e)}") + print(traceback.format_exc()) + + logger.info("Finished computing Fisher matrix") # Set up the prover use_vllm = False - corpus_path = dst_dir + "/corpus.jsonl" + # FIXED: Use os.path.join instead of string concatenation + corpus_path = os.path.join(str(dst_dir), "corpus.jsonl") tactic = ( None # `None` since we are not using a fixed tactic generator ) @@ -1034,9 +1081,61 @@ def main(): logger.info("Finished processing the repository") current_epoch += epochs_per_repo logger.info(f"current epoch: {current_epoch}") - if use_fisher: - # Need to return to compute the FIM - return + # 
FIXED: Removed early return to allow all 23 repos to be processed + # Fisher computation will happen between repos via external script + # if use_fisher: + # # Need to return to compute the FIM\n # return + + # ADDED: Second sorry proving pass after all repos ("Add. After") + if is_main_process and i == num_repos - 1: + logger.info("=" * 80) + logger.info("STARTING SECOND PASS: 'Add. After' with final model") + logger.info("=" * 80) + + if ray.is_initialized(): + logger.info("Shutting down Ray before second pass") + ray.shutdown() + + # Use the final checkpoint + try: + final_checkpoint = find_latest_checkpoint() + logger.info(f"Using final checkpoint for second pass: {final_checkpoint}") + except FileNotFoundError: + logger.error("No checkpoint found for second pass") + + # Create new prover with final model + final_prover = DistributedProver( + use_vllm, + ckpt_path, + corpus_path, + tactic, + module, + num_workers, + num_gpus=num_gpus, + timeout=timeout, + max_expansions=max_expansions, + num_sampled_tactics=num_sampled_tactics, + raid_dir=RAID_DIR, + checkpoint_dir=CHECKPOINT_DIR, + debug=debug, + run_progressive_training=run_progressive_training, + ) + + # Reprove ALL repos with final model + logger.info("Reproving ALL repositories with final model for 'Add. After' pass") + prove_sorry_theorems( + db, + final_prover, + dynamic_database_json_path, + repos_to_include=None, # All repos + ) + + save_database_locked(db, dynamic_database_json_path) + logger.info("Completed 'Add. After' pass") + + if ray.is_initialized(): + logger.info("Shutting down Ray after second pass") + ray.shutdown() except Exception as e: logger.info(f"An error occurred: {e}", file=sys.stderr)
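
# Note on the EWC mechanism that the use_fisher / Fisher-matrix changes above rely on.
# This is a minimal illustrative sketch only, not part of the patch and not the repository's
# actual FisherComputationModule or PremiseRetriever API. The names `fisher_info`,
# `old_params`, and `lambda_value` are assumptions chosen for illustration: `fisher_info`
# stands in for the diagonal Fisher Information Matrix saved to FISHER_DIR between repos,
# `old_params` for the parameter snapshot taken after the previous repository, and
# `lambda_value` for the regularization strength (0.1 in the patch).

import torch

def ewc_penalty(model: torch.nn.Module,
                fisher_info: dict[str, torch.Tensor],
                old_params: dict[str, torch.Tensor],
                lambda_value: float = 0.1) -> torch.Tensor:
    """Quadratic penalty that anchors parameters important to earlier repositories."""
    penalty = torch.zeros((), device=next(model.parameters()).device)
    for name, param in model.named_parameters():
        if name in fisher_info:
            # The Fisher diagonal weights how strongly each parameter is pulled back
            # toward the value it had after training on the previous repository.
            penalty = penalty + (fisher_info[name] * (param - old_params[name]) ** 2).sum()
    return 0.5 * lambda_value * penalty

# Typical use inside a training step (usage sketch under the same assumptions):
#   loss = task_loss + ewc_penalty(model, fisher_info, old_params, lambda_value=0.1)
# Computing fisher_info between repositories and adding this penalty during the next
# repository's training is what allows cumulative (single_repo = False) learning without
# catastrophic forgetting, which is why the patch enables use_fisher and removes the early
# return that previously stopped the loop after the first repository.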