Zipstack · johnyrahul · Jun 10, 2026 · Jun 8, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py
@@ -385,6 +385,7 @@ def whisper(
         line_spitter_strategy: str = "left-priority",
         add_line_nos: bool = False,
         include_line_confidence: bool = False,
+        word_confidence_threshold: float = 0.3,
         lang: str = "eng",
         tag: str = "default",
         filename: str = "",
@@ -401,8 +402,8 @@ def whisper(
             file_path (str, optional): The path to the file to be processed. Defaults to "".
             stream (IO[bytes], optional): A stream of bytes to be processed. Defaults to None.
             url (str, optional): The URL of the file to be processed. Defaults to "".
-            mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text".
-                Defaults to "high_quality".
+            mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost", "native_text"
+                or "table". Defaults to "high_quality".
             output_mode (str, optional): The output mode. Can be "layout_preserving" or "text".
                 Defaults to "layout_preserving".
             page_seperator (str, optional): The page separator. Defaults to "<<<".
@@ -418,6 +419,11 @@ def whisper(
               which can be queried later using the highlights API.
             include_line_confidence (bool, optional): Adds line confidence to the line metadata returned by
               the highlights API. Requires add_line_nos to be enabled. Defaults to False.
+            word_confidence_threshold (float, optional): The minimum OCR confidence score a word must have to be
+              included in the extracted text. Accepts a value in the range [0.0, 1.0], where higher values are
+              stricter. Any word whose confidence value falls below the configured threshold is ignored and
+              excluded from the final output. This parameter works only with "form", "high_quality" and "table"
+              modes. Defaults to 0.3.
             lang (str, optional): The language of the document. Defaults to "eng".
             tag (str, optional): The tag for the document. Defaults to "default".
             filename (str, optional): The name of the file to store in reports. Defaults to "".
@@ -454,6 +460,7 @@ def whisper(
             "line_spitter_strategy": line_spitter_strategy,
             "add_line_nos": add_line_nos,
             "include_line_confidence": include_line_confidence,
+            "word_confidence_threshold": word_confidence_threshold,
             "lang": lang,
             "tag": tag,
             "filename": filename,

diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py
@@ -206,6 +206,13 @@ def test_webhook(client_v2: LLMWhispererClientV2, url: str, token: str, webhook_
     Returns:
         None
     """
+    # Clean up any webhook left over from a previous (possibly failed) run so
+    # registration starts from a clean slate.
+    try:
+        client_v2.delete_webhook(webhook_name)
+    except LLMWhispererClientException:
+        pass
+
     result = client_v2.register_webhook(url, token, webhook_name)
     assert isinstance(result, dict)
     assert result["message"] == "Webhook created successfully"

diff --git a/tests/unit/client_v2_test.py b/tests/unit/client_v2_test.py
@@ -1,5 +1,6 @@
 import time
 from unittest.mock import MagicMock
+from urllib.parse import parse_qs, urlparse
 
 import pytest
 import requests
@@ -151,6 +152,34 @@ def test_whisper_invalid_json_response_202(mocker: MockerFixture, client_v2: LLM
     assert response["extraction"] == {}
 
 
+def test_whisper_default_word_confidence_threshold(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None:
+    """whisper() sends the default word_confidence_threshold when not specified."""
+    mock_send = mocker.patch("requests.Session.send")  # replaced with a mock, so no real request is sent
+    mock_send.return_value = _mock_response(200, '{"status_code": 200, "extraction": {"text": "ok"}}')
+
+    client_v2.whisper(url="https://example.com/test.pdf", wait_for_completion=False)
+
+    prepared_request = mock_send.call_args[0][0]  # first positional argument given to the patched send()
+    query = parse_qs(urlparse(prepared_request.url).query)  # parse_qs maps each key to a list of strings
+    assert query["word_confidence_threshold"] == ["0.3"]  # string form of the 0.3 signature default
+
+
+def test_whisper_custom_word_confidence_threshold(mocker: MockerFixture, client_v2: LLMWhispererClientV2) -> None:
+    """whisper() forwards a custom word_confidence_threshold as a request param."""
+    mock_send = mocker.patch("requests.Session.send")  # replaced with a mock, so no real request is sent
+    mock_send.return_value = _mock_response(200, '{"status_code": 200, "extraction": {"text": "ok"}}')
+
+    client_v2.whisper(
+        url="https://example.com/test.pdf",
+        word_confidence_threshold=0.75,  # differs from the 0.3 default so forwarding is observable
+        wait_for_completion=False,
+    )
+
+    prepared_request = mock_send.call_args[0][0]  # first positional argument given to the patched send()
+    query = parse_qs(urlparse(prepared_request.url).query)  # parse_qs maps each key to a list of strings
+    assert query["word_confidence_threshold"] == ["0.75"]  # the float is serialized into the URL as text
+
+
 # --- Retry behavior tests ---