diff --git a/pyproject.toml b/pyproject.toml
index c1a1a4e..29671b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dev = [
     "nbqa>=1.9.1",
     "pip-audit>=2.7.3",
     "pre-commit>=4.1.0",
+    "pymupdf>=1.26.7",
     "pytest>=8.3.4",
     "pytest-asyncio>=1.2.0",
     "pytest-cov>=7.0.0",
diff --git a/src/utils/data/pdf_to_hf_dataset.py b/src/utils/data/pdf_to_hf_dataset.py
new file mode 100644
index 0000000..931bcaa
--- /dev/null
+++ b/src/utils/data/pdf_to_hf_dataset.py
@@ -0,0 +1,852 @@
+"""Convert PDF files into a HuggingFace dataset with OpenAI-compatible OCR.
+
+This script renders PDF pages to images, sends each page to a multimodal model
+via an OpenAI-compatible API, and then chunks the resulting text into a
+HuggingFace dataset with a ``text`` column.
+
+Requirements
+------------
+- ``OPENAI_API_KEY`` must be set in your environment.
+- ``OPENAI_BASE_URL`` is optional; set it to target a non-default endpoint.
+- ``pymupdf`` is required for PDF rendering.
+
+Examples
+--------
+Transcribe a single PDF and save to ``hf_dataset``:
+    uv run --env-file .env src/utils/data/pdf_to_hf_dataset.py \
+        --input-path ./docs/example.pdf
+
+Transcribe a folder recursively at a lower DPI with a custom output directory:
+    uv run --env-file .env src/utils/data/pdf_to_hf_dataset.py \
+        --input-path ./docs --recursive --dpi 150 --output-dir ./out_dataset
+
+Notes
+-----
+- Pages can be skipped using regex patterns or by skipping front/back matter.
+- Chunking is token-based using the tokenizer for the target embedding model.
+- Use ``--structured-ocr`` to request JSON blocks for more reliable headings.
+"""
+
+import base64
+import os
+import re
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+import click
+import datasets
+from dotenv import load_dotenv
+from openai import OpenAI
+from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
+from transformers import AutoTokenizer
+
+
+DEFAULT_SKIP_PATTERNS = (
+    r"^table of contents\b",
+    r"^contents\b",
+    r"^appendix\b",
+    r"^appendices\b",
+    r"^references\b",
+    r"^bibliography\b",
+    r"^glossary\b",
+    r"^index\b",
+    r"^acknowledg(e)?ments\b",
+    r"^foreword\b",
+    r"^preface\b",
+)
+
+DEFAULT_PROMPT = (
+    "Transcribe all readable text from this page in natural reading order. "
+    "Return plain text only. Do not summarize or add commentary. "
+    "If images, charts, or other visualizations are referenced in the text, "
+    "transcribe their content as part of the flow. "
+    "Ignore headers, footers, page numbers, and decorative elements."
+)
+
+STRUCTURED_PROMPT = (
+    "Extract the page content and return JSON only with this schema: "
+    '{"blocks":[{"type":"heading|paragraph|list|table|figure|caption",'
+    '"text":"...","level":1}]}. '
+    "Use type=heading for section titles. If you can infer hierarchy from "
+    "numbering, set level (1-4); otherwise omit or set null. Preserve the "
+    "reading order and keep text verbatim. Do not add commentary. If there is "
+    'no readable text, return {"blocks":[]}. Ignore page numbers, headers, footers, '
+    "and decorative elements. For figures (images, charts, schematics, etc.) "
+    "describe the content in text, in as much detail as possible."
+)
+
+
+@dataclass
+class Segment:
+    """Chunking segment derived from a page."""
+
+    text: str
+    title: str | None = None
+    level: int | None = None
+
+
+class BlockType(str, Enum):
+    """Allowed block types for structured OCR."""
+
+    HEADING = "heading"
+    PARAGRAPH = "paragraph"
+    LIST = "list"
+    TABLE = "table"
+    FIGURE = "figure"
+    CAPTION = "caption"
+
+
+class Block(BaseModel):
+    """Structured OCR block schema for response_format."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    type: BlockType = Field(
+        default=BlockType.PARAGRAPH,
+        description="Block type (heading, paragraph, list, table, figure, caption).",
+    )
+    text: str = Field(..., description="Verbatim block text.")
+    level: int | None = Field(default=None, description="Heading level if available.")
+
+    @field_validator("type", mode="before")
+    @classmethod
+    def _normalize_type(cls, value: object) -> BlockType:
+        if isinstance(value, BlockType):
+            return value
+        if isinstance(value, str):
+            cleaned = value.strip().lower()
+            try:
+                return BlockType(cleaned)
+            except ValueError:
+                return BlockType.PARAGRAPH
+        return BlockType.PARAGRAPH
+
+
+class Page(BaseModel):
+    """Structured OCR output for a page."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    blocks: list[Block] = Field(default_factory=list)
+
+
+def _load_pymupdf() -> object:
+    """Import PyMuPDF lazily and raise a clear error if missing."""
+    try:
+        import pymupdf  # noqa: PLC0415
+    except ImportError as exc:  # pragma: no cover - import guard
+        raise SystemExit(
+            "PyMuPDF is required to read PDFs. Install it with `pip install pymupdf`."
+        ) from exc
+    return pymupdf
+
+
+def _resolve_pdf_paths(input_path: Path, recursive: bool) -> list[Path]:
+    """Return a sorted list of PDF files from a file or directory input."""
+    if input_path.is_file():
+        if input_path.suffix.lower() != ".pdf":
+            raise ValueError("input_path must point to a PDF file.")
+        return [input_path]
+
+    if not input_path.is_dir():
+        raise ValueError("input_path must be a PDF file or directory.")
+
+    if recursive:
+        candidates = [p for p in input_path.rglob("*") if p.is_file()]
+    else:
+        candidates = [p for p in input_path.iterdir() if p.is_file()]
+
+    return sorted([p for p in candidates if p.suffix.lower() == ".pdf"])
+
+
+def _looks_like_toc(text: str) -> bool:
+    """Heuristic for detecting table-of-contents style pages."""
+    # look for repeated "title .... page" patterns across many lines.
+    # TOCs are usually noisy for retrieval and waste embedding budget.
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+    if len(lines) < 6:
+        return False
+
+    dotted = sum(1 for line in lines if re.search(r"\.{2,}\s*\d{1,4}$", line))
+    numbered = sum(1 for line in lines if re.search(r"\s\d{1,4}$", line))
+    score = max(dotted, numbered) / len(lines)
+    return score >= 0.3
+
+
+def _should_skip_page(
+    text: str,
+    *,
+    min_page_characters: int,
+    min_page_words: int,
+    skip_patterns: list[re.Pattern[str]],
+    skip_toc_detection: bool,
+) -> bool:
+    """Decide whether to drop a page based on text heuristics."""
+    # front/back matter and sparse pages add noise to RAG indexes.
+    # reject pages that are too short or match known filler patterns.
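+    # e.g. with the CLI defaults, a page is dropped when its normalized text
+    # is under 200 characters or its first line matches "table of contents".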
+ stripped = text.strip() + if not stripped: + return True + + normalized = " ".join(stripped.split()) + if min_page_characters and len(normalized) < min_page_characters: + return True + if min_page_words and len(normalized.split()) < min_page_words: + return True + + first_line = "" + for line_text in stripped.splitlines(): + stripped_line = line_text.strip() + if stripped_line: + first_line = stripped_line.lower() + break + + if first_line and any(pattern.search(first_line) for pattern in skip_patterns): + return True + + return bool(skip_toc_detection and _looks_like_toc(stripped)) + + +def _chunk_text( + text: str, + tokenizer: AutoTokenizer, + chunk_size: int, + chunk_overlap: int, +) -> list[str]: + """Split text into overlapping token windows for embedding.""" + # the embedding model has a hard context window we must respect. + # let the tokenizer produce overlapping windows capped at chunk_size. + if not text.strip(): + return [] + if chunk_size <= 0: + return [text.strip()] + if chunk_overlap < 0 or chunk_overlap >= chunk_size: + raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size.") + + # some tokenizers can panic on unexpected characters; keep the pipeline moving. + # try tokenization, then sanitize/retry, then use char windows. + try: + encoding = tokenizer( + text, + add_special_tokens=False, + max_length=chunk_size, + truncation=True, + stride=chunk_overlap, + return_overflowing_tokens=True, + return_attention_mask=False, + return_token_type_ids=False, + ) + except Exception: + cleaned = _sanitize_text(text) + if cleaned and cleaned != text: + try: + encoding = tokenizer( + cleaned, + add_special_tokens=False, + max_length=chunk_size, + truncation=True, + stride=chunk_overlap, + return_overflowing_tokens=True, + return_attention_mask=False, + return_token_type_ids=False, + ) + text = cleaned + except Exception: + return _fallback_chunk_text(cleaned, chunk_size, chunk_overlap) + else: + return _fallback_chunk_text(cleaned, chunk_size, chunk_overlap) + + input_ids = encoding.get("input_ids", []) + if not input_ids: + return [] + if isinstance(input_ids[0], int): + input_ids = [input_ids] + + chunks: list[str] = [] + for chunk_ids in input_ids: + chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True).strip() + if chunk_text: + chunks.append(chunk_text) + + return chunks + + +def _sanitize_text(text: str) -> str: + """Remove invalid unicode sequences that can crash some tokenizers.""" + # OCR sometimes yields unpaired surrogates or invalid UTF-8 sequences. + # round-trip through UTF-8 with "ignore" to drop problematic bytes. + return text.encode("utf-8", "ignore").decode("utf-8", "ignore") + + +def _fallback_chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]: + """Chunk by character window when tokenization is unavailable.""" + # if the tokenizer panics, approximate chunking still produces usable text. + # treat 1 token ~= 4 chars to create overlapping character windows. 
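+    # e.g. chunk_size=512 with chunk_overlap=64 gives 2048-char windows that
+    # advance 1792 chars per step (256 chars of overlap between windows).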
+ if not text.strip(): + return [] + if chunk_size <= 0: + return [text.strip()] + + char_limit = max(chunk_size * 4, 1) + char_overlap = max(chunk_overlap * 4, 0) + step = max(char_limit - char_overlap, 1) + + chunks: list[str] = [] + for start in range(0, len(text), step): + chunk = text[start : start + char_limit].strip() + if chunk: + chunks.append(chunk) + if start + char_limit >= len(text): + break + return chunks + + +def _parse_structured_page(response_text: str) -> Page | None: + """Parse structured OCR output into a Pydantic model.""" + # some providers return raw JSON even with response_format enabled. + # attempt a strict JSON parse into the Page schema. + try: + return Page.model_validate_json(response_text) + except ValidationError: + return None + + +def _token_count(tokenizer: AutoTokenizer, text: str) -> int: + """Return the number of tokens for a text string.""" + # we need approximate token sizes to merge blocks for RAG. + # try tokenization, then sanitize/retry, then approximate by chars. + try: + return len(tokenizer.encode(text, add_special_tokens=False)) + except Exception: + cleaned = _sanitize_text(text) + if cleaned and cleaned != text: + try: + return len(tokenizer.encode(cleaned, add_special_tokens=False)) + except Exception: + pass + approx = len(cleaned) if cleaned else len(text) + return max(1, approx // 4) + + +def _segments_from_blocks( + blocks: list[Block], + tokenizer: AutoTokenizer, + *, + max_tokens: int, + min_tokens: int, +) -> list[Segment]: + """Split blocks into heading-scoped segments with size limits.""" + # RAG works better with coherent sections than isolated blocks. + # collect blocks under the latest heading until a size budget is hit. + if not blocks: + return [] + + segments: list[Segment] = [] + current_parts: list[str] = [] + active_title: str | None = None + active_level: int | None = None + current_tokens = 0 + + def render_segment_text(body: str) -> str: + # headings are useful retrieval context but should not stand alone. + # prefix the heading once if the body does not already include it. 
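+        # e.g. title "2. Safety" and body "Wear gloves." render as
+        # "2. Safety\nWear gloves."; a body that already starts with the
+        # title is returned unchanged.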
+ if active_title and not body.lower().startswith(active_title.lower()): + return f"{active_title}\n{body}".strip() + return body + + def flush() -> None: + nonlocal current_parts, current_tokens + body = "\n".join(part for part in current_parts if part.strip()).strip() + if body: + segments.append( + Segment( + text=render_segment_text(body), + title=active_title, + level=active_level, + ) + ) + current_parts = [] + current_tokens = 0 + + for block in blocks: + text = block.text.strip() + if not text: + continue + + if block.type == BlockType.HEADING: + flush() + active_title = text + active_level = block.level + continue + + block_tokens = _token_count(tokenizer, text) if max_tokens > 0 else 0 + if not current_parts: + current_parts = [text] + current_tokens = block_tokens + continue + + if ( + max_tokens > 0 + and current_tokens + block_tokens > max_tokens + and current_tokens >= min_tokens + ): + flush() + current_parts = [text] + current_tokens = block_tokens + else: + current_parts.append(text) + current_tokens += block_tokens + + flush() + return segments + + +def _transcribe_page( + client: object, + *, + model: str, + image_bytes: bytes, + prompt: str, + temperature: float, + max_output_tokens: int | None, + seed: int | None, + max_retries: int, + retry_base_seconds: float, + response_format: type[BaseModel] | None, +) -> tuple[str, Page | None]: + """Call the multimodal model to OCR a single page image.""" + # the OCR model expects a prompt plus an image payload. + # send a chat message with the prompt and base64-encoded PNG. + image_b64 = base64.b64encode(image_bytes).decode("ascii") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_b64}"}, + }, + ], + } + ] + + for attempt in range(max_retries + 1): + try: + request_kwargs: dict[str, object] = { + "model": model, + "messages": messages, + "temperature": temperature, + } + if max_output_tokens is not None: + request_kwargs["max_tokens"] = max_output_tokens + if seed is not None: + request_kwargs["seed"] = seed + if response_format is not None: + request_kwargs["response_format"] = response_format + + # structured OCR returns parsed content when response_format is set. + # use parse() to capture message.parsed without extra JSON parsing. 
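+                # note: this assumes an openai SDK release where parse() is
+                # available at the top level; older releases expose it only
+                # under client.beta.chat.completions.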
+ response = client.chat.completions.parse(**request_kwargs) + message = response.choices[0].message + content = message.content if message else "" + parsed_page: Page | None = None + if message is not None and hasattr(message, "parsed"): + parsed = message.parsed + if isinstance(parsed, Page): + parsed_page = parsed + elif isinstance(parsed, BaseModel): + parsed_page = None + elif isinstance(parsed, dict) and response_format is not None: + try: + parsed_page = response_format.model_validate(parsed) + except ValidationError: + parsed_page = None + return (content.strip() if content else ""), parsed_page + except Exception as exc: # pragma: no cover - network errors + if attempt >= max_retries: + raise exc + time.sleep(retry_base_seconds * (2**attempt)) + + return "", None + + +@contextmanager +def _page_indices(page_limit: int, show_progress: bool, label: str): + """Yield page indices, optionally wrapped in a progress bar.""" + indices = range(page_limit) + if show_progress: + with click.progressbar(indices, label=label) as progress: + yield progress + else: + yield indices + + +def _compile_skip_patterns( + skip_pattern: tuple[str, ...], + use_default_skip_patterns: bool, +) -> list[re.Pattern[str]]: + """Compile regex patterns that identify filler pages.""" + # reusable regexes are faster and keep skip logic centralized. + # merge default and custom patterns, then compile with IGNORECASE. + patterns: list[str] = list(skip_pattern) + if use_default_skip_patterns: + patterns = list(DEFAULT_SKIP_PATTERNS) + patterns + return [re.compile(pattern, re.IGNORECASE) for pattern in patterns] + + +def _resolve_openai_api_key() -> str: + """Load the API key from environment variables.""" + # keys should be sourced from env to avoid shell history leaks. + # check common env var names used for OpenAI/Gemini-compatible keys. + openai_api_key = ( + os.getenv("OPENAI_API_KEY") + or os.getenv("GEMINI_API_KEY") + or os.getenv("GOOGLE_API_KEY") + ) + if openai_api_key is None: + raise ValueError( + "API key not found. Set OPENAI_API_KEY (or GEMINI_API_KEY/GOOGLE_API_KEY)." + ) + return openai_api_key + + +def _get_openai_client(api_key: str, base_url: str | None) -> object: + """Instantiate an OpenAI-compatible client.""" + client_kwargs = {"api_key": api_key} + if base_url: + client_kwargs["base_url"] = base_url + return OpenAI(**client_kwargs) + + +def _collect_records( + pdf_paths: list[Path], + pymupdf: object, + client: object, + tokenizer: AutoTokenizer, + *, + chunk_size: int, + chunk_overlap: int, + model: str, + prompt: str, + temperature: float, + max_output_tokens: int | None, + seed: int | None, + max_retries: int, + retry_base_seconds: float, + dpi: int, + max_pages_per_doc: int | None, + skip_front_pages: int, + skip_back_pages: int, + min_page_characters: int, + min_page_words: int, + skip_patterns: list[re.Pattern[str]], + skip_toc_detection: bool, + show_progress: bool, + structured_ocr: bool, + source_root: Path, +) -> list[dict[str, object]]: + """Process PDFs into a list of dataset records.""" + records: list[dict[str, object]] = [] + # segment sizes should roughly align with final chunk sizes for RAG. + # derive a max/min token budget from chunk_size. 
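+    # e.g. chunk_size=512 caps segments at 512 tokens and only flushes early
+    # once a segment has accumulated at least 128 tokens.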
+ segment_max_tokens = chunk_size if chunk_size > 0 else 0 + segment_min_tokens = max(64, chunk_size // 4) if chunk_size > 0 else 0 + + for pdf_path in pdf_paths: + try: + doc = pymupdf.open(pdf_path) + except Exception as exc: # pragma: no cover - backend errors + click.echo(f"Skipping {pdf_path.name}: failed to open PDF ({exc}).") + continue + + with doc: + page_count = doc.page_count + max_pages = max_pages_per_doc or page_count + page_limit = min(page_count, max_pages) + source = pdf_path.relative_to(source_root).as_posix() + + with _page_indices( + page_limit, show_progress, f"OCR {pdf_path.name}" + ) as page_iter: + for page_index in page_iter: + if skip_front_pages and page_index < skip_front_pages: + continue + if skip_back_pages and page_index >= page_limit - skip_back_pages: + continue + try: + # render each page as an image to feed the OCR model. + # rasterize the page into PNG bytes for the API call. + page = doc.load_page(page_index) + pixmap = page.get_pixmap(dpi=dpi) + image_bytes = pixmap.tobytes("png") + + response_text, structured_page = _transcribe_page( + client, + model=model, + image_bytes=image_bytes, + prompt=prompt, + temperature=temperature, + max_output_tokens=max_output_tokens, + seed=seed, + max_retries=max_retries, + retry_base_seconds=retry_base_seconds, + response_format=(Page if structured_ocr else None), + ) + except Exception as exc: # pragma: no cover - backend errors + click.echo( + f"Skipping {pdf_path.name} page {page_index + 1}: {exc}" + ) + continue + + # some providers return JSON as plain text even with parsing. + # parse the raw response into the Page schema when needed. + if structured_ocr and structured_page is None and response_text: + structured_page = _parse_structured_page(response_text) + page_text = response_text + if structured_page is not None and structured_page.blocks: + page_text = "\n".join( + block.text for block in structured_page.blocks if block.text + ) + + if _should_skip_page( + page_text, + min_page_characters=min_page_characters, + min_page_words=min_page_words, + skip_patterns=skip_patterns, + skip_toc_detection=skip_toc_detection, + ): + continue + + segments = [Segment(text=page_text)] + if structured_page is not None and structured_page.blocks: + segments = _segments_from_blocks( + structured_page.blocks, + tokenizer, + max_tokens=segment_max_tokens, + min_tokens=segment_min_tokens, + ) + + for segment_index, segment in enumerate(segments): + # final chunks must fit the embedding context window. + # split each segment into token-sized windows. 
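+                        # e.g. a 1,500-token segment with chunk_size=512 and
+                        # chunk_overlap=64 yields windows starting at token
+                        # offsets 0, 448, 896, and 1344.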
+ chunks = _chunk_text( + segment.text, tokenizer, chunk_size, chunk_overlap + ) + for chunk_index, chunk in enumerate(chunks): + records.append( + { + "text": chunk, + "source": source, + "page_index": page_index + 1, + "segment_index": segment_index, + "chunk_index": chunk_index, + "section_title": segment.title, + "section_level": segment.level, + } + ) + + return records + + +def _save_dataset( + records: list[dict[str, object]], output_dir: Path +) -> datasets.DatasetDict: + """Save records to disk as a HuggingFace DatasetDict.""" + dataset = datasets.Dataset.from_list(records) + dataset_dict = datasets.DatasetDict({"train": dataset}) + output_dir.mkdir(parents=True, exist_ok=True) + dataset_dict.save_to_disk(output_dir) + click.echo(f"Saved dataset with {len(dataset)} chunks to {output_dir}.") + return dataset_dict + + +@click.command() +@click.option( + "--input-path", + required=True, + type=click.Path(exists=True, path_type=Path), + help="Path to a PDF file or directory of PDFs.", +) +@click.option( + "--output-dir", + default=Path("hf_dataset"), + type=click.Path(path_type=Path), + show_default=True, + help="Directory to save the HuggingFace dataset.", +) +@click.option( + "--recursive/--no-recursive", + default=False, + show_default=True, + help="Recursively scan directories for PDFs.", +) +@click.option( + "--tokenizer-name", + default="BAAI/bge-m3", + show_default=True, + help="Tokenizer used for chunking.", +) +@click.option( + "--chunk-size", + default=512, + show_default=True, + help="Max tokens per chunk.", +) +@click.option( + "--chunk-overlap", + default=64, + show_default=True, + help="Token overlap between chunks.", +) +@click.option( + "--model", + default="gemini-2.5-flash", + show_default=True, + help="Model used for OCR (OpenAI-compatible endpoint).", +) +@click.option( + "--openai-base-url", + default=None, + help="Overrides OPENAI_BASE_URL for the OpenAI-compatible endpoint.", +) +@click.option( + "--prompt", + default=DEFAULT_PROMPT, + show_default=False, + help="Prompt passed to the OCR model.", +) +@click.option("--temperature", default=0.0, show_default=True, type=float) +@click.option("--max-output-tokens", default=4096, show_default=True, type=int) +@click.option("--seed", default=None, type=int) +@click.option("--dpi", default=300, show_default=True, type=int) +@click.option("--max-pages-per-doc", default=None, type=int) +@click.option("--skip-front-pages", default=0, show_default=True, type=int) +@click.option("--skip-back-pages", default=0, show_default=True, type=int) +@click.option("--min-page-characters", default=200, show_default=True, type=int) +@click.option("--min-page-words", default=0, show_default=True, type=int) +@click.option( + "--skip-toc-detection/--no-skip-toc-detection", + default=True, + show_default=True, +) +@click.option( + "--use-default-skip-patterns/--no-default-skip-patterns", + default=True, + show_default=True, +) +@click.option( + "--skip-pattern", + multiple=True, + help="Regex pattern to skip pages if it matches the first line.", +) +@click.option("--max-retries", default=3, show_default=True, type=int) +@click.option("--retry-base-seconds", default=2.0, show_default=True, type=float) +@click.option( + "--show-progress/--no-show-progress", + default=True, + show_default=True, + help="Show a progress indicator while OCR runs.", +) +@click.option( + "--structured-ocr/--no-structured-ocr", + default=False, + show_default=True, + help="Request structured JSON blocks from the OCR model (overrides --prompt).", +) 
+@click.option("--save-to-hub", is_flag=True) +@click.option("--hub-repo-id", default=None) +def main( + input_path: Path, + output_dir: Path, + recursive: bool, + tokenizer_name: str, + chunk_size: int, + chunk_overlap: int, + model: str, + openai_base_url: str | None, + prompt: str, + temperature: float, + max_output_tokens: int, + seed: int | None, + dpi: int, + max_pages_per_doc: int | None, + skip_front_pages: int, + skip_back_pages: int, + min_page_characters: int, + min_page_words: int, + skip_toc_detection: bool, + use_default_skip_patterns: bool, + skip_pattern: tuple[str, ...], + max_retries: int, + retry_base_seconds: float, + show_progress: bool, + structured_ocr: bool, + save_to_hub: bool, + hub_repo_id: str | None, +) -> None: + """Convert PDFs to a chunked HuggingFace dataset.""" + if chunk_overlap >= chunk_size: + raise ValueError("chunk_overlap must be smaller than chunk_size.") + + pdf_paths = _resolve_pdf_paths(input_path, recursive) + if not pdf_paths: + raise ValueError("No PDF files found to process.") + + compiled_patterns = _compile_skip_patterns(skip_pattern, use_default_skip_patterns) + openai_api_key = _resolve_openai_api_key() + prompt_text = STRUCTURED_PROMPT if structured_ocr else prompt + + max_output = max_output_tokens if max_output_tokens > 0 else None + + pymupdf = _load_pymupdf() + client = _get_openai_client(openai_api_key, openai_base_url) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True) + + source_root = input_path if input_path.is_dir() else input_path.parent + + records = _collect_records( + pdf_paths, + pymupdf, + client, + tokenizer, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + model=model, + prompt=prompt_text, + temperature=temperature, + max_output_tokens=max_output, + seed=seed, + max_retries=max_retries, + retry_base_seconds=retry_base_seconds, + dpi=dpi, + max_pages_per_doc=max_pages_per_doc, + skip_front_pages=skip_front_pages, + skip_back_pages=skip_back_pages, + min_page_characters=min_page_characters, + min_page_words=min_page_words, + skip_patterns=compiled_patterns, + skip_toc_detection=skip_toc_detection, + show_progress=show_progress, + structured_ocr=structured_ocr, + source_root=source_root, + ) + + if not records: + raise ValueError("No text chunks were produced.") + + dataset_dict = _save_dataset(records, output_dir) + + if save_to_hub: + if not hub_repo_id: + raise ValueError("hub_repo_id must be provided when save_to_hub is True.") + dataset_dict.push_to_hub(hub_repo_id, private=False) + + +if __name__ == "__main__": + load_dotenv() + + main() diff --git a/uv.lock b/uv.lock index 77a816c..605b563 100644 --- a/uv.lock +++ b/uv.lock @@ -42,6 +42,7 @@ dev = [ { name = "nbqa" }, { name = "pip-audit" }, { name = "pre-commit" }, + { name = "pymupdf" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -97,6 +98,7 @@ dev = [ { name = "nbqa", specifier = ">=1.9.1" }, { name = "pip-audit", specifier = ">=2.7.3" }, { name = "pre-commit", specifier = ">=4.1.0" }, + { name = "pymupdf", specifier = ">=1.26.7" }, { name = "pytest", specifier = ">=8.3.4" }, { name = "pytest-asyncio", specifier = ">=1.2.0" }, { name = "pytest-cov", specifier = ">=7.0.0" }, @@ -3959,6 +3961,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/06/43084e6cbd4b3bc0e80f6be743b2e79fbc6eed8de9ad8c629939fa55d972/pymdown_extensions-10.16.1-py3-none-any.whl", hash = "sha256:d6ba157a6c03146a7fb122b2b9a121300056384eafeec9c9f9e584adfdb2a32d", size = 266178, upload-time = 
"2025-07-28T16:19:31.401Z" }, ] +[[package]] +name = "pymupdf" +version = "1.26.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/48/d6/09b28f027b510838559f7748807192149c419b30cb90e6d5f0cf916dc9dc/pymupdf-1.26.7.tar.gz", hash = "sha256:71add8bdc8eb1aaa207c69a13400693f06ad9b927bea976f5d5ab9df0bb489c3", size = 84327033, upload-time = "2025-12-11T21:48:50.694Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/35/cd74cea1787b2247702ef8522186bdef32e9cb30a099e6bb864627ef6045/pymupdf-1.26.7-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:07085718dfdae5ab83b05eb5eb397f863bcc538fe05135318a01ea353e7a1353", size = 23179369, upload-time = "2025-12-11T21:47:21.587Z" }, + { url = "https://files.pythonhosted.org/packages/72/74/448b6172927c829c6a3fba80078d7b0a016ebbe2c9ee528821f5ea21677a/pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:31aa9c8377ea1eea02934b92f4dcf79fb2abba0bf41f8a46d64c3e31546a3c02", size = 22470101, upload-time = "2025-12-11T21:47:37.105Z" }, + { url = "https://files.pythonhosted.org/packages/65/e7/47af26f3ac76be7ac3dd4d6cc7ee105948a8355d774e5ca39857bf91c11c/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e419b609996434a14a80fa060adec72c434a1cca6a511ec54db9841bc5d51b3c", size = 23502486, upload-time = "2025-12-12T09:51:25.824Z" }, + { url = "https://files.pythonhosted.org/packages/2a/6b/3de1714d734ff949be1e90a22375d0598d3540b22ae73eb85c2d7d1f36a9/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:69dfc78f206a96e5b3ac22741263ebab945fdf51f0dbe7c5757c3511b23d9d72", size = 24115727, upload-time = "2025-12-11T21:47:51.274Z" }, + { url = "https://files.pythonhosted.org/packages/62/9b/f86224847949577a523be2207315ae0fd3155b5d909cd66c274d095349a3/pymupdf-1.26.7-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1d5106f46e1ca0d64d46bd51892372a4f82076bdc14a9678d33d630702abca36", size = 24324386, upload-time = "2025-12-12T14:58:45.483Z" }, + { url = "https://files.pythonhosted.org/packages/85/8e/a117d39092ca645fde8b903f4a941d9aa75b370a67b4f1f435f56393dc5a/pymupdf-1.26.7-cp310-abi3-win32.whl", hash = "sha256:7c9645b6f5452629c747690190350213d3e5bbdb6b2eca227d82702b327f6eee", size = 17203888, upload-time = "2025-12-12T13:59:57.613Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c3/d0047678146c294469c33bae167c8ace337deafb736b0bf97b9bc481aa65/pymupdf-1.26.7-cp310-abi3-win_amd64.whl", hash = "sha256:425b1befe40d41b72eb0fe211711c7ae334db5eb60307e9dd09066ed060cceba", size = 18405952, upload-time = "2025-12-11T21:48:02.947Z" }, +] + [[package]] name = "pyparsing" version = "3.2.5"