From d79c1a2b95cfda6420614b3bcb12a8751999d5f0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 8 Mar 2026 23:48:39 +0100 Subject: [PATCH 1/3] Add pycryptodome dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 463db58f1..36172b0fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ PyPDF2==3.0.1 python-dotenv==1.1.0 tiktoken==0.11.0 pyyaml==6.0.2 +pycryptodome==3.15.0 \ No newline at end of file From 584a9c9575c7f1428d92c3c780fb096caac79c86 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 8 Mar 2026 23:49:38 +0100 Subject: [PATCH 2/3] Default offset to 0 when calculate_page_offset returns None --- pageindex/page_index.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 39018c4df..5b48535cb 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -632,6 +632,8 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_che logger.info(f'matching_pairs: {matching_pairs}') offset = calculate_page_offset(matching_pairs) + if offset is None: + offset = 0 logger.info(f'offset: {offset}') toc_with_page_number = add_page_offset_to_toc_json(toc_with_page_number, offset) From ba8c6c36dfb2cc422307dfde8eb97a79f15db960 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 8 Mar 2026 23:49:49 +0100 Subject: [PATCH 3/3] Add configurable OpenAI model and tokenizer parameter --- pageindex/config.yaml | 1 + pageindex/page_index.py | 18 +++++++++--------- pageindex/page_index_md.py | 26 +++++++++++++------------- pageindex/utils.py | 14 ++++++++++---- run_pageindex.py | 4 ++++ 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index fd73e3a2c..9414f0e11 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,4 +1,5 @@ model: "gpt-4o-2024-11-20" +tokenizer: "" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 
20000 diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 5b48535cb..f1c9b45ca 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -565,13 +565,13 @@ def generate_toc_init(part, model=None): else: raise Exception(f'finish reason: {finish_reason}') -def process_no_toc(page_list, start_index=1, model=None, logger=None): +def process_no_toc(page_list, start_index=1, model=None, logger=None, tokenizer=None): page_contents=[] token_lengths=[] for page_index in range(start_index, start_index+len(page_list)): page_text = f"\n{page_list[page_index-start_index][0]}\n\n\n" page_contents.append(page_text) - token_lengths.append(count_tokens(page_text, model)) + token_lengths.append(count_tokens(page_text, model, tokenizer=tokenizer)) group_texts = page_list_to_group_text(page_contents, token_lengths) logger.info(f'len(group_texts): {len(group_texts)}') @@ -586,7 +586,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None): return toc_with_page_number -def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None): +def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None, tokenizer=None): page_contents=[] token_lengths=[] toc_content = toc_transformer(toc_content, model) @@ -594,8 +594,8 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in for page_index in range(start_index, start_index+len(page_list)): page_text = f"\n{page_list[page_index-start_index][0]}\n\n\n" page_contents.append(page_text) - token_lengths.append(count_tokens(page_text, model)) - + token_lengths.append(count_tokens(page_text, model, tokenizer=tokenizer)) + group_texts = page_list_to_group_text(page_contents, token_lengths) logger.info(f'len(group_texts): {len(group_texts)}') @@ -957,9 +957,9 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N if mode == 'process_toc_with_page_numbers': 
toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger) elif mode == 'process_toc_no_page_numbers': - toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger) + toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger, tokenizer=opt.tokenizer) else: - toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) + toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger, tokenizer=opt.tokenizer) toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] @@ -1068,7 +1068,7 @@ def page_index_main(doc, opt=None): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc) + page_list = get_page_tokens(doc, model=opt.model, tokenizer=opt.tokenizer) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1102,7 +1102,7 @@ async def page_index_builder(): return asyncio.run(page_index_builder()) -def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, +def page_index(doc, model=None, tokenizer=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None): user_opt = { diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index 70e8de086..11c482a3b 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -7,18 +7,18 @@ except: from utils import * -async def get_node_summary(node, summary_token_threshold=200, model=None): +async def 
get_node_summary(node, summary_token_threshold=200, model=None, tokenizer=None): node_text = node.get('text') - num_tokens = count_tokens(node_text, model=model) + num_tokens = count_tokens(node_text, model=model, tokenizer=tokenizer) if num_tokens < summary_token_threshold: return node_text else: return await generate_node_summary(node, model=model) -async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None): +async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None, tokenizer=None): nodes = structure_to_list(structure) - tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model) for node in nodes] + tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model, tokenizer=tokenizer) for node in nodes] summaries = await asyncio.gather(*tasks) for node, summary in zip(nodes, summaries): @@ -86,7 +86,7 @@ def extract_node_text_content(node_list, markdown_lines): node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip() return all_nodes -def update_node_list_with_text_token_count(node_list, model=None): +def update_node_list_with_text_token_count(node_list, model=None, tokenizer=None): def find_all_children(parent_index, parent_level, node_list): """Find all direct and indirect children of a parent node""" @@ -127,12 +127,12 @@ def find_all_children(parent_index, parent_level, node_list): total_text += '\n' + child_text # Calculate token count for combined text - result_list[i]['text_token_count'] = count_tokens(total_text, model=model) - + result_list[i]['text_token_count'] = count_tokens(total_text, model=model, tokenizer=tokenizer) + return result_list -def tree_thinning_for_index(node_list, min_node_token=None, model=None): +def tree_thinning_for_index(node_list, min_node_token=None, model=None, tokenizer=None): def find_all_children(parent_index, parent_level, node_list): children_indices = [] @@ -179,7 +179,7 @@ def 
find_all_children(parent_index, parent_level, node_list): result_list[i]['text'] = merged_text - result_list[i]['text_token_count'] = count_tokens(merged_text, model=model) + result_list[i]['text_token_count'] = count_tokens(merged_text, model=model, tokenizer=tokenizer) for index in sorted(nodes_to_remove, reverse=True): result_list.pop(index) @@ -240,7 +240,7 @@ def clean_tree_for_output(tree_nodes): return cleaned_nodes -async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'): +async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, tokenizer=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'): with open(md_path, 'r', encoding='utf-8') as f: markdown_content = f.read() @@ -251,9 +251,9 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad nodes_with_content = extract_node_text_content(node_list, markdown_lines) if if_thinning: - nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model) + nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model, tokenizer=tokenizer) print(f"Thinning nodes...") - nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model) + nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model, tokenizer=tokenizer) print(f"Building tree from nodes...") tree_structure = build_tree_from_nodes(nodes_with_content) @@ -268,7 +268,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes']) print(f"Generating summaries for each node...") - 
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model) + tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model, tokenizer=tokenizer) if if_add_node_text == 'no': # Remove text after summary generation if not requested diff --git a/pageindex/utils.py b/pageindex/utils.py index dc7acd888..84cf9dcbc 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -19,10 +19,13 @@ CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") -def count_tokens(text, model=None): +def count_tokens(text, model=None, tokenizer=None): if not text: return 0 - enc = tiktoken.encoding_for_model(model) + if tokenizer: + enc = tiktoken.get_encoding(tokenizer) + else: + enc = tiktoken.encoding_for_model(model) tokens = enc.encode(text) return len(tokens) @@ -410,8 +413,11 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): - enc = tiktoken.encoding_for_model(model) +def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2", tokenizer=None): + if tokenizer: + enc = tiktoken.get_encoding(tokenizer) + else: + enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] diff --git a/run_pageindex.py b/run_pageindex.py index 107024505..acb715794 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -11,6 +11,7 @@ parser.add_argument('--md_path', type=str, help='Path to the Markdown file') parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') + parser.add_argument('--tokenizer', type=str, default='', help='Tiktoken encoding name (e.g. o200k_base). 
Empty = auto-detect from model.') parser.add_argument('--toc-check-pages', type=int, default=20, help='Number of pages to check for table of contents (PDF only)') @@ -54,6 +55,7 @@ # Configure options opt = config( model=args.model, + tokenizer=args.tokenizer, toc_check_page_num=args.toc_check_pages, max_page_num_each_node=args.max_pages_per_node, max_token_num_each_node=args.max_tokens_per_node, @@ -98,6 +100,7 @@ # Create options dict with user args user_opt = { 'model': args.model, + 'tokenizer': args.tokenizer, 'if_add_node_summary': args.if_add_node_summary, 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, @@ -114,6 +117,7 @@ if_add_node_summary=opt.if_add_node_summary, summary_token_threshold=args.summary_token_threshold, model=opt.model, + tokenizer=opt.tokenizer, if_add_doc_description=opt.if_add_doc_description, if_add_node_text=opt.if_add_node_text, if_add_node_id=opt.if_add_node_id