diff --git a/.vscode/settings.json b/.vscode/settings.json index c02be26..26c144c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,5 +12,7 @@ "numeric": "cpp", "ostream": "cpp", "sstream": "cpp" - } + }, + "python.analysis.typeCheckingMode": "off", + "ty.interpreter": [".venv/bin/python"] } diff --git a/misc/on_mac_demo/README.md b/misc/on_mac_demo/README.md new file mode 100644 index 0000000..5a51674 --- /dev/null +++ b/misc/on_mac_demo/README.md @@ -0,0 +1,29 @@ +need Xcode app + +``` +sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer +``` + +``` +sudo git clone https://github.com/ggml-org/whisper.cpp /opt/whisper.cpp +sudo chown -R $(id -u):$(id -g) /opt/whisper.cpp + +cd /opt/whisper.cpp + +uv venv -p 3.11 +uv pip install ane_transformers openai-whisper coremltools +source .venv/bin/activate +``` + +``` +./models/generate-coreml-model.sh small +./models/download-ggml-model.sh small +./models/generate-coreml-model.sh large-v3-turbo +./models/download-ggml-model.sh large-v3-turbo +``` + +``` +# rm -rf build +cmake -B build -DWHISPER_COREML=ON -DWHISPER_FFMPEG=ON -DGGML_NATIVE=OFF +cmake --build build -j --config Release +``` diff --git a/misc/on_mac_demo/run-whisper-server-large-turbo.sh b/misc/on_mac_demo/run-whisper-server-large-turbo.sh new file mode 100755 index 0000000..40b4be9 --- /dev/null +++ b/misc/on_mac_demo/run-whisper-server-large-turbo.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -xe + +/opt/whisper.cpp/build/bin/whisper-server \ + --host 0.0.0.0 \ + --port "8432" \ + --model /opt/whisper.cpp/models/ggml-large-v3-turbo.bin \ + -l ja \ + -nt \ + -sns \ + --vad \ + -vm /opt/whisper.cpp/models/ggml-silero-v6.2.0.bin \ + -vt "0.5" \ + -vspd "100" \ + -vsd "500" \ + -vp "200" \ + --convert diff --git a/misc/on_mac_demo/run-whisper-server-small.sh b/misc/on_mac_demo/run-whisper-server-small.sh new file mode 100755 index 0000000..94217c0 --- /dev/null +++ b/misc/on_mac_demo/run-whisper-server-small.sh @@ -0,0 +1,16 @@ 
+#!/bin/bash +set -xe + +/opt/whisper.cpp/build/bin/whisper-server \ + --host 0.0.0.0 \ + --port "8431" \ + --model /opt/whisper.cpp/models/ggml-small.bin \ + -l ja \ + -nt \ + -sns \ + --vad \ + -vm /opt/whisper.cpp/models/ggml-silero-v6.2.0.bin \ + -vt "0.5" \ + -vspd "100" \ + -vsd "500" \ + -vp "200" diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py index d508cb5..c992ab1 100644 --- a/stackchan_server/speech_recognition/whisper_server.py +++ b/stackchan_server/speech_recognition/whisper_server.py @@ -43,16 +43,16 @@ def __init__( *, config: WhisperServerSpeechToTextConfig | None = None, ) -> None: - self._conf = config or WhisperServerSpeechToTextConfig() - self._server_url = self._conf.url + self.config = config or WhisperServerSpeechToTextConfig() + self._server_url = self.config.url async def transcribe(self, pcm_bytes: bytes) -> str: rms_level = _pcm_rms_level(pcm_bytes) - if rms_level < self._conf.silence_rms_threshold: + if rms_level < self.config.silence_rms_threshold: logger.info( "Skipping whisper-server transcription because pcm rms %.2f is below silence threshold %.2f", rms_level, - self._conf.silence_rms_threshold, + self.config.silence_rms_threshold, ) return "" @@ -65,7 +65,7 @@ async def transcribe(self, pcm_bytes: bytes) -> str: transcript = await asyncio.to_thread( self._request_transcript, wav_bytes, - self._conf.language, + self.config.language, ) if transcript: logger.info("whisper-server transcript: %s", transcript) @@ -73,20 +73,20 @@ async def transcribe(self, pcm_bytes: bytes) -> str: def _request_transcript(self, wav_bytes: bytes, language: str) -> str: fields = { - "response_format": self._conf.response_format, + "response_format": self.config.response_format, } normalized_language = language.strip() if normalized_language: fields["language"] = normalized_language - if self._conf.prompt: - fields["prompt"] = self._conf.prompt + if self.config.prompt: + 
fields["prompt"] = self.config.prompt - if self._conf.model: - fields["model"] = self._conf.model + if self.config.model: + fields["model"] = self.config.model - if self._conf.detect_language: + if self.config.detect_language: fields["detect_language"] = "true" body, content_type = _encode_multipart_formdata( @@ -102,7 +102,7 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str: logger.info("Running whisper-server request: POST %s", self._server_url) try: with urlopen( - request, timeout=self._conf.request_timeout_seconds + request, timeout=self.config.request_timeout_seconds ) as response: response_body = response.read() except HTTPError as exc: @@ -113,7 +113,7 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str: except URLError as exc: raise RuntimeError(f"whisper-server request failed: {exc.reason}") from exc - if self._conf.response_format == "json": + if self.config.response_format == "json": payload = _load_json_response_bytes(response_body) if not isinstance(payload, Mapping): return "" diff --git a/stackchan_server/wakeup_word_detection/whisper_server.py b/stackchan_server/wakeup_word_detection/whisper_server.py index 6b25fe6..38a3703 100644 --- a/stackchan_server/wakeup_word_detection/whisper_server.py +++ b/stackchan_server/wakeup_word_detection/whisper_server.py @@ -25,7 +25,7 @@ class WakeWordDetectionTimeout(WakeWordDetectionError): class WhisperServerWakeWordDetectorConfig(BaseSettings): - keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"]) + keywords: list[str] = Field(default_factory=lambda: ["ハイスタックチャン"]) window_seconds: float = 3.0 interval_seconds: float = 0.5 timeout_seconds: float = 300.0 @@ -192,6 +192,9 @@ def _contains_wake_word(self, transcript: str) -> bool: if not normalized_transcript: return False + if (normalized_prompt := _normalize_text(self.recognizer.config.prompt or "")) and normalized_prompt in normalized_transcript: + # A transcript that echoes the recognizer prompt suggests the model is 
confused, so it is ignored to avoid false positives. An unset or empty prompt is skipped, because an empty string is a substring of every transcript and would otherwise reject every wake word. + return False for keyword in self.config.keywords: normalized_keyword = _normalize_text(keyword) if normalized_keyword and normalized_keyword in normalized_transcript: