37 changes: 34 additions & 3 deletions README.md
@@ -147,34 +147,65 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
pip3 install --upgrade -r requirements.txt
```

### 2. Set your OpenAI API key
### 2. Set your API key

Create a `.env` file in the root directory and add your API key:
Create a `.env` file in the root directory and add your API key for your chosen provider:

```bash
# OpenAI (default)
CHATGPT_API_KEY=your_openai_key_here

# MiniMax (optional)
MINIMAX_API_KEY=your_minimax_key_here
```

### 3. Run PageIndex on your PDF

**OpenAI (default):**
```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
```

**MiniMax:**
```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf \
--provider minimax --model MiniMax-Text-01
```

<details>
<summary><strong>Optional parameters</strong></summary>
<br>
You can customize the processing with additional optional arguments:

```
--model OpenAI model to use (default: gpt-4o-2024-11-20)
--model Model to use (default: gpt-4o-2024-11-20)
--provider LLM provider: openai or minimax (default: openai)
--api-base-url Custom API base URL (e.g. https://api.minimax.io/v1 for MiniMax)
--toc-check-pages Pages to check for table of contents (default: 20)
--max-pages-per-node Max pages per node (default: 10)
--max-tokens-per-node Max tokens per node (default: 20000)
--if-add-node-id Add node ID (yes/no, default: yes)
--if-add-node-summary Add node summary (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: yes)
```

You can also set the provider via environment variables instead of CLI flags:
```bash
export LLM_PROVIDER=minimax
export API_BASE_URL=https://api.minimax.io/v1 # optional, for custom endpoints
```
</details>

<details>
<summary><strong>Supported LLM Providers</strong></summary>
<br>

| Provider | Example Models | API Key Env Var | Notes |
|----------|---------------|-----------------|-------|
| **OpenAI** (default) | `gpt-4o-2024-11-20`, `gpt-4o-mini` | `CHATGPT_API_KEY` | Full support, recommended |
| **MiniMax** | `MiniMax-M2.5`, `MiniMax-Text-01` | `MINIMAX_API_KEY` | Full support via OpenAI-compatible API at `https://api.minimax.io/v1` |

**Note:** PageIndex relies on structured JSON output from the LLM. For best results, use capable models (GPT-4o or MiniMax-Text-01). Smaller models may produce lower-quality tree structures.
</details>
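
The provider options are also available from Python. Below is a minimal sketch based on the `page_index` signature in this diff (the exact import path and the handling of the returned structure are assumptions, not shown in the PR):

```python
# Sketch only: `page_index` and its provider kwargs come from this PR;
# the import path and printing of the result are assumed.
from pageindex.page_index import page_index

tree = page_index(
    "/path/to/your/document.pdf",
    model="MiniMax-Text-01",
    provider="minimax",
    api_base_url="https://api.minimax.io/v1",  # optional; provider default is used if omitted
)
print(tree)
```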

<details>
2 changes: 2 additions & 0 deletions pageindex/config.yaml
@@ -1,4 +1,6 @@
model: "gpt-4o-2024-11-20"
provider: "openai" # "openai" or "minimax"
api_base_url: null # Custom API base URL (e.g. https://api.minimax.io/v1 for MiniMax)
toc_check_page_num: 20
max_page_num_each_node: 10
max_token_num_each_node: 20000
17 changes: 14 additions & 3 deletions pageindex/page_index.py
@@ -1057,7 +1057,17 @@ async def tree_parser(page_list, opt, doc=None, logger=None):

def page_index_main(doc, opt=None):
    logger = JsonLogger(doc)

    # Set provider config from opt so all downstream API calls pick it up
    from pageindex import utils
    if hasattr(opt, 'provider') and opt.provider:
        os.environ['LLM_PROVIDER'] = opt.provider
        utils.LLM_PROVIDER = opt.provider
    if hasattr(opt, 'api_base_url') and opt.api_base_url:
        os.environ['API_BASE_URL'] = opt.api_base_url
        utils.API_BASE_URL = opt.api_base_url

    is_valid_pdf = (
        (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
        isinstance(doc, BytesIO)
@@ -1066,7 +1076,7 @@ def page_index_main(doc, opt=None):
        raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

    print('Parsing PDF...')
    page_list = get_page_tokens(doc)
    page_list = get_page_tokens(doc, model=opt.model)

    logger.info({'total_page_number': len(page_list)})
    logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1100,7 +1110,8 @@ async def page_index_builder():
    return asyncio.run(page_index_builder())


def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
def page_index(doc, model=None, provider=None, api_base_url=None,
               toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
               if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):

    user_opt = {
98 changes: 82 additions & 16 deletions pageindex/utils.py
@@ -18,25 +18,88 @@
from types import SimpleNamespace as config

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY")
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai") # "openai" or "minimax"
API_BASE_URL = os.getenv("API_BASE_URL") # Custom API base URL

MINIMAX_BASE_URL = "https://api.minimax.io/v1"


def _is_minimax_model(model):
"""Check if the model is a MiniMax model by name prefix."""
if not model:
return False
model_lower = model.lower()
# Support various MiniMax model naming patterns:
# - minimax-m1, minimax-m2.5, minimax-m2.5-highspeed
# - MiniMax-Text-01, abab6.5s-chat, etc.
return model_lower.startswith("minimax") or model_lower.startswith("abab")
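
# Illustrative only (not part of this diff): expected results of the
# prefix check above, given the model names named in this PR.
#   _is_minimax_model("MiniMax-Text-01")  -> True
#   _is_minimax_model("abab6.5s-chat")    -> True
#   _is_minimax_model("gpt-4o-mini")      -> False
#   _is_minimax_model(None)               -> False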


def _get_provider_config(provider=None, api_key=None, base_url=None):
"""Resolve provider, api_key, and base_url from args or environment."""
provider = provider or LLM_PROVIDER

if provider == "minimax":
return {
"provider": "minimax",
"api_key": api_key or MINIMAX_API_KEY,
"base_url": base_url or API_BASE_URL or MINIMAX_BASE_URL,
}
else: # openai (default)
cfg = {
"provider": "openai",
"api_key": api_key or CHATGPT_API_KEY,
}
if base_url or API_BASE_URL:
cfg["base_url"] = base_url or API_BASE_URL
return cfg


def _get_client_kwargs(model, api_key=None, provider=None, base_url=None):
"""Get OpenAI client kwargs based on model name or explicit provider config."""
# If provider is explicitly set, use provider config
if provider:
pcfg = _get_provider_config(provider, api_key, base_url)
client_kwargs = {"api_key": pcfg["api_key"]}
if "base_url" in pcfg:
client_kwargs["base_url"] = pcfg["base_url"]
return client_kwargs

# Auto-detect based on model name
if _is_minimax_model(model):
return {
"api_key": api_key or MINIMAX_API_KEY,
"base_url": base_url or API_BASE_URL or MINIMAX_BASE_URL,
}

# Default to OpenAI
cfg = {"api_key": api_key or CHATGPT_API_KEY}
if base_url or API_BASE_URL:
cfg["base_url"] = base_url or API_BASE_URL
return cfg
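
# Illustrative only (not part of this diff): resolution order of the helper
# above. An explicit provider wins, then model-name detection, then the
# OpenAI default; keys and base URLs fall back to the environment variables.
# Assuming API_BASE_URL is unset:
#   _get_client_kwargs("MiniMax-Text-01")
#       -> {"api_key": MINIMAX_API_KEY, "base_url": "https://api.minimax.io/v1"}
#   _get_client_kwargs("gpt-4o-mini", provider="minimax")
#       -> MiniMax key and base URL, despite the OpenAI-style model name
#   _get_client_kwargs("gpt-4o-mini")
#       -> {"api_key": CHATGPT_API_KEY}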

def count_tokens(text, model=None):
    if not text:
        return 0
    enc = tiktoken.encoding_for_model(model)
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    return len(tokens)
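
# Illustrative only (not part of this diff): tiktoken has no encoding
# registered for MiniMax model names, so encoding_for_model raises KeyError
# and the count falls back to cl100k_base, as the try/except above intends.
#   count_tokens("hello world", model="gpt-4o")           # native gpt-4o encoding
#   count_tokens("hello world", model="MiniMax-Text-01")  # cl100k_base fallback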

def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def ChatGPT_API_with_finish_reason(model, prompt, api_key=None, chat_history=None, provider=None, base_url=None):
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    client = openai.OpenAI(**_get_client_kwargs(model, api_key, provider, base_url))
    for i in range(max_retries):
        try:
            if chat_history:
                messages = chat_history
                messages.append({"role": "user", "content": prompt})
            else:
                messages = [{"role": "user", "content": prompt}]

            response = client.chat.completions.create(
                model=model,
                messages=messages,
@@ -51,47 +114,47 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if i < max_retries - 1:
                time.sleep(1) # Wait for 1秒 before retrying
                time.sleep(1)
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
                return "Error"



def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def ChatGPT_API(model, prompt, api_key=None, chat_history=None, provider=None, base_url=None):
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    client = openai.OpenAI(**_get_client_kwargs(model, api_key, provider, base_url))
    for i in range(max_retries):
        try:
            if chat_history:
                messages = chat_history
                messages.append({"role": "user", "content": prompt})
            else:
                messages = [{"role": "user", "content": prompt}]

            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
            )

            return response.choices[0].message.content
        except Exception as e:
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if i < max_retries - 1:
                time.sleep(1) # Wait for 1秒 before retrying
                time.sleep(1)
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
                return "Error"


async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
async def ChatGPT_API_async(model, prompt, api_key=None, provider=None, base_url=None):
    max_retries = 10
    messages = [{"role": "user", "content": prompt}]
    for i in range(max_retries):
        try:
            async with openai.AsyncOpenAI(api_key=api_key) as client:
            async with openai.AsyncOpenAI(**_get_client_kwargs(model, api_key, provider, base_url)) as client:
                response = await client.chat.completions.create(
                    model=model,
                    messages=messages,
@@ -102,7 +165,7 @@ async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if i < max_retries - 1:
                await asyncio.sleep(1) # Wait for 1s before retrying
                await asyncio.sleep(1)
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
                return "Error"
@@ -411,7 +474,10 @@ def add_preface_if_needed(data):


def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
    enc = tiktoken.encoding_for_model(model)
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    if pdf_parser == "PyPDF2":
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        page_list = []
@@ -530,10 +596,10 @@ def remove_structure_text(data):
    return data


def check_token_limit(structure, limit=110000):
def check_token_limit(structure, limit=110000, model='gpt-4o'):
    list = structure_to_list(structure)
    for node in list:
        num_tokens = count_tokens(node['text'], model='gpt-4o')
        num_tokens = count_tokens(node['text'], model=model)
        if num_tokens > limit:
            print(f"Node ID: {node['node_id']} has {num_tokens} tokens")
            print("Start Index:", node['start_index'])
12 changes: 11 additions & 1 deletion run_pageindex.py
@@ -10,7 +10,13 @@
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20',
                    help='Model to use (e.g. gpt-4o-2024-11-20, MiniMax-Text-01, abab6.5s-chat)')
parser.add_argument('--provider', type=str, default='openai',
                    choices=['openai', 'minimax'],
                    help='LLM provider: openai or minimax (default: openai)')
parser.add_argument('--api-base-url', type=str, default=None,
                    help='Custom API base URL (e.g. https://api.minimax.io/v1 for MiniMax)')

parser.add_argument('--toc-check-pages', type=int, default=20,
                    help='Number of pages to check for table of contents (PDF only)')
@@ -54,6 +60,8 @@
# Configure options
opt = config(
    model=args.model,
    provider=args.provider,
    api_base_url=args.api_base_url,
    toc_check_page_num=args.toc_check_pages,
    max_page_num_each_node=args.max_pages_per_node,
    max_token_num_each_node=args.max_tokens_per_node,
@@ -98,6 +106,8 @@
# Create options dict with user args
user_opt = {
    'model': args.model,
    'provider': args.provider,
    'api_base_url': args.api_base_url,
    'if_add_node_summary': args.if_add_node_summary,
    'if_add_doc_description': args.if_add_doc_description,
    'if_add_node_text': args.if_add_node_text,