Merge pull request #22 from 74th/fix/whisper-cpp

74th · web-flow · commit c6bb02492e28 · 2026-03-14T09:50:09.000+09:00
Handle invalid UTF-8 in whisper transcription responses
diff --git a/stackchan_server/speech_recognition/whisper_cpp.py b/stackchan_server/speech_recognition/whisper_cpp.py
@@ -165,7 +165,14 @@ def _normalize_transcript(text: str) -> str:
 
 
 def _load_transcript_from_json(path: Path) -> str:
-    data = json.loads(path.read_text(encoding="utf-8"))
+    try:
+        raw_bytes = path.read_bytes()
+        text = raw_bytes.decode("utf-8", errors="replace")
+        if "\ufffd" in text:
+            logger.warning("whisper.cpp JSON output contains invalid UTF-8 bytes")
+        data = json.loads(text)
+    except (json.JSONDecodeError, OSError) as exc:
+        raise RuntimeError(f"Failed to read whisper.cpp JSON output: {exc}") from exc
     transcription = data.get("transcription")
     if not isinstance(transcription, list):
         return ""
diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py
@@ -96,11 +96,14 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
             raise RuntimeError(f"whisper-server request failed: {exc.reason}") from exc
 
         if self._response_format == "json":
-            payload = json.loads(response_body.decode("utf-8"))
+            payload = _load_json_response_bytes(response_body)
+            if not isinstance(payload, Mapping):
+                return ""
+            payload = cast(Mapping[str, object], payload)
             text = payload.get("text")
             return text.strip() if isinstance(text, str) else ""
 
-        payload = json.loads(response_body.decode("utf-8"))
+        payload = _load_json_response_bytes(response_body)
         return _load_transcript_from_verbose_json(payload)
 
 
@@ -118,6 +121,13 @@ def _normalize_language(language_code: str) -> str:
     return language_code.split("-", 1)[0].lower()
 
 
+def _load_json_response_bytes(response_body: bytes) -> object:
+    response_text = response_body.decode("utf-8", errors="replace")
+    if "\ufffd" in response_text:
+        logger.warning("whisper-server JSON output contains invalid UTF-8 bytes")
+    return json.loads(response_text)
+
+
 def _load_transcript_from_verbose_json(payload: object) -> str:
     if not isinstance(payload, Mapping):
         return ""
diff --git a/tests/test_whisper_server.py b/tests/test_whisper_server.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import unittest
+
+from stackchan_server.speech_recognition.whisper_server import (
+    _load_json_response_bytes,
+    _load_transcript_from_verbose_json,
+)
+
+
+class WhisperServerJsonTests(unittest.TestCase):
+    def test_load_json_response_bytes_replaces_invalid_utf8(self) -> None:
+        payload = _load_json_response_bytes(b'{"transcription":[{"text":"\xe6\x90"},{"text":"ok"}]}')
+
+        self.assertEqual(payload, {"transcription": [{"text": "�"}, {"text": "ok"}]})
+
+    def test_load_transcript_from_verbose_json_with_replacement_char(self) -> None:
+        payload = {
+            "transcription": [
+                {"text": "�"},
+                {"text": "ok"},
+            ]
+        }
+
+        transcript = _load_transcript_from_verbose_json(payload)
+
+        self.assertEqual(transcript, "� ok")
+
+
+if __name__ == "__main__":
+    unittest.main()