From 2967d3e84b57278ed37a61885f8000ea464ffa49 Mon Sep 17 00:00:00 2001 From: Rahul Johny Date: Mon, 8 Jun 2026 17:49:08 +0530 Subject: [PATCH 1/3] feat: add word_confidence_threshold parameter to whisper Adds the `word_confidence_threshold` (float, default 0.3) parameter to the v2 client's whisper() method, forwarding it to the API. Words whose OCR confidence falls below the threshold are excluded from the output. Works with form, high_quality and table modes. Adds unit tests covering the default and a custom value. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/unstract/llmwhisperer/client_v2.py | 6 ++++++ tests/unit/client_v2_test.py | 26 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 00afc50..8c1b0e9 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -385,6 +385,7 @@ def whisper( line_spitter_strategy: str = "left-priority", add_line_nos: bool = False, include_line_confidence: bool = False, + word_confidence_threshold: float = 0.3, lang: str = "eng", tag: str = "default", filename: str = "", @@ -418,6 +419,10 @@ def whisper( which can be queried later using the highlights API. include_line_confidence (bool, optional): Adds line confidence to the line metadata returned by the highlights API. Requires add_line_nos to be enabled. Defaults to False. + word_confidence_threshold (float, optional): The minimum OCR confidence score a word must have to be + included in the extracted text. Any text whose confidence value falls below the configured threshold + is ignored and excluded from the final output. This parameter works only with "form", "high_quality" + and "table" modes. Defaults to 0.3. lang (str, optional): The language of the document. Defaults to "eng". tag (str, optional): The tag for the document. Defaults to "default". filename (str, optional): The name of the file to store in reports. 
Defaults to "". @@ -454,6 +459,7 @@ def whisper( "line_spitter_strategy": line_spitter_strategy, "add_line_nos": add_line_nos, "include_line_confidence": include_line_confidence, + "word_confidence_threshold": word_confidence_threshold, "lang": lang, "tag": tag, "filename": filename, diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index 8b1535f..952b120 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -151,6 +151,32 @@ def test_whisper_invalid_json_response_202(mocker: MockerFixture, client_v2: LLM assert response["extraction"] == {} +def test_whisper_default_word_confidence_threshold(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """whisper() sends the default word_confidence_threshold when not specified.""" + mock_send = mocker.patch("requests.Session.send") + mock_send.return_value = _mock_response(200, '{"status_code": 200, "extraction": {"text": "ok"}}') + + client_v2.whisper(url="https://example.com/test.pdf", wait_for_completion=False) + + prepared_request = mock_send.call_args[0][0] + assert "word_confidence_threshold=0.3" in prepared_request.url + + +def test_whisper_custom_word_confidence_threshold(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: + """whisper() forwards a custom word_confidence_threshold as a request param.""" + mock_send = mocker.patch("requests.Session.send") + mock_send.return_value = _mock_response(200, '{"status_code": 200, "extraction": {"text": "ok"}}') + + client_v2.whisper( + url="https://example.com/test.pdf", + word_confidence_threshold=0.75, + wait_for_completion=False, + ) + + prepared_request = mock_send.call_args[0][0] + assert "word_confidence_threshold=0.75" in prepared_request.url + + # --- Retry behavior tests --- From 3cba816c2ede99b6708cb023a4369a9bf55f1a02 Mon Sep 17 00:00:00 2001 From: Rahul Johny Date: Tue, 9 Jun 2026 15:57:34 +0530 Subject: [PATCH 2/3] refactor: address PR review on word_confidence_threshold - Add "table" 
to the list of valid whisper() modes in the docstring - Document the valid range [0.0, 1.0] for word_confidence_threshold and make the wording word-consistent - Assert on the parsed query value (parse_qs) instead of a URL substring so the tests can't false-match on a prefix (e.g. 0.3 vs 0.35) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/unstract/llmwhisperer/client_v2.py | 11 ++++++----- tests/unit/client_v2_test.py | 7 +++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 8c1b0e9..4f47005 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -402,8 +402,8 @@ def whisper( file_path (str, optional): The path to the file to be processed. Defaults to "". stream (IO[bytes], optional): A stream of bytes to be processed. Defaults to None. url (str, optional): The URL of the file to be processed. Defaults to "". - mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". - Defaults to "high_quality". + mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost", "native_text" + or "table". Defaults to "high_quality". output_mode (str, optional): The output mode. Can be "layout_preserving" or "text". Defaults to "layout_preserving". page_seperator (str, optional): The page separator. Defaults to "<<<". @@ -420,9 +420,10 @@ def whisper( include_line_confidence (bool, optional): Adds line confidence to the line metadata returned by the highlights API. Requires add_line_nos to be enabled. Defaults to False. word_confidence_threshold (float, optional): The minimum OCR confidence score a word must have to be - included in the extracted text. Any text whose confidence value falls below the configured threshold - is ignored and excluded from the final output. This parameter works only with "form", "high_quality" - and "table" modes. Defaults to 0.3. 
+ included in the extracted text. Accepts a value in the range [0.0, 1.0], where higher values are + stricter. Any word whose confidence value falls below the configured threshold is ignored and + excluded from the final output. This parameter works only with "form", "high_quality" and "table" + modes. Defaults to 0.3. lang (str, optional): The language of the document. Defaults to "eng". tag (str, optional): The tag for the document. Defaults to "default". filename (str, optional): The name of the file to store in reports. Defaults to "". diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py index 952b120..eb5921f 100644 --- a/tests/unit/client_v2_test.py +++ b/tests/unit/client_v2_test.py @@ -1,5 +1,6 @@ import time from unittest.mock import MagicMock +from urllib.parse import parse_qs, urlparse import pytest import requests @@ -159,7 +160,8 @@ def test_whisper_default_word_confidence_threshold(mocker: MockerFixture, client client_v2.whisper(url="https://example.com/test.pdf", wait_for_completion=False) prepared_request = mock_send.call_args[0][0] - assert "word_confidence_threshold=0.3" in prepared_request.url + query = parse_qs(urlparse(prepared_request.url).query) + assert query["word_confidence_threshold"] == ["0.3"] def test_whisper_custom_word_confidence_threshold(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None: @@ -174,7 +176,8 @@ def test_whisper_custom_word_confidence_threshold(mocker: MockerFixture, client_ ) prepared_request = mock_send.call_args[0][0] - assert "word_confidence_threshold=0.75" in prepared_request.url + query = parse_qs(urlparse(prepared_request.url).query) + assert query["word_confidence_threshold"] == ["0.75"] # --- Retry behavior tests --- From 244e810e920b424e2e61e740242ce5596fec52b7 Mon Sep 17 00:00:00 2001 From: Rahul Johny Date: Tue, 9 Jun 2026 16:08:59 +0530 Subject: [PATCH 3/3] test: delete pre-existing webhook before registering in test_webhook A webhook left over from a previous failed run 
caused register_webhook to fail on a stale record. Delete any existing webhook (ignoring any LLMWhispererClientException the delete raises, such as not-found) before registering so the test starts from a clean slate. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/integration/client_v2_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index dd60297..7ec0a5a 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -206,6 +206,13 @@ def test_webhook(client_v2: LLMWhispererClientV2, url: str, token: str, webhook_ Returns: None """ + # Clean up any webhook left over from a previous (possibly failed) run so + # registration starts from a clean slate. + try: + client_v2.delete_webhook(webhook_name) + except LLMWhispererClientException: + pass + result = client_v2.register_webhook(url, token, webhook_name) assert isinstance(result, dict) assert result["message"] == "Webhook created successfully"