Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@
"numeric": "cpp",
"ostream": "cpp",
"sstream": "cpp"
}
},
"python.analysis.typeCheckingMode": "off",
"ty.interpreter": [".venv/bin/python"]
}
29 changes: 29 additions & 0 deletions misc/on_mac_demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Requires the Xcode app. Point the active developer directory at it:

```
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
```

```
sudo git clone https://github.com/ggml-org/whisper.cpp /opt/whisper.cpp
sudo chown -R $(id -u):$(id -g) /opt/whisper.cpp

cd /opt/whisper.cpp

uv venv -p 3.11
uv pip install ane_transformers openai-whisper coremltools
source .venv/bin/activate
```

```
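./models/generate-coreml-model.sh small
./models/download-ggml-model.sh small
./models/generate-coreml-model.sh large-v3-turbo
./models/download-ggml-model.sh large-v3-turbo
# The run-whisper-server-*.sh scripts load models/ggml-silero-v6.2.0.bin for VAD.
./models/download-vad-model.sh silero-v6.2.0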
```

```
# rm -rf build
cmake -B build -DWHISPER_COREML=ON -DWHISPER_FFMPEG=ON -DGGML_NATIVE=OFF
cmake --build build -j --config Release
```
17 changes: 17 additions & 0 deletions misc/on_mac_demo/run-whisper-server-large-turbo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Start whisper-server with the large-v3-turbo model, listening on all
# interfaces at port 8432 with Japanese as the language.
#
# NOTE(review): the short flags below (-nt, -sns, -vt, -vspd, -vsd, -vp) are
# passed through untouched; confirm their meaning with
# `whisper-server --help` for the build in /opt/whisper.cpp.

# Abort on the first failing command.
set -e

# One option (and its value, if any) per line.
server_args=(
    --host 0.0.0.0
    --port "8432"
    --model /opt/whisper.cpp/models/ggml-large-v3-turbo.bin
    -l ja
    -nt
    -sns
    --vad
    -vm /opt/whisper.cpp/models/ggml-silero-v6.2.0.bin
    -vt "0.5"
    -vspd "100"
    -vsd "500"
    -vp "200"
    --convert
)

# Trace only the launch itself so the fully expanded command line is logged.
set -x
/opt/whisper.cpp/build/bin/whisper-server "${server_args[@]}"
16 changes: 16 additions & 0 deletions misc/on_mac_demo/run-whisper-server-small.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Start whisper-server with the small model, listening on all interfaces at
# port 8431 with Japanese as the language.
#
# NOTE(review): the short flags below (-nt, -sns, -vt, -vspd, -vsd, -vp) are
# passed through untouched; confirm their meaning with
# `whisper-server --help` for the build in /opt/whisper.cpp.

# Abort on the first failing command.
set -e

# One option (and its value, if any) per line.
server_args=(
    --host 0.0.0.0
    --port "8431"
    --model /opt/whisper.cpp/models/ggml-small.bin
    -l ja
    -nt
    -sns
    --vad
    -vm /opt/whisper.cpp/models/ggml-silero-v6.2.0.bin
    -vt "0.5"
    -vspd "100"
    -vsd "500"
    -vp "200"
)

# Trace only the launch itself so the fully expanded command line is logged.
set -x
/opt/whisper.cpp/build/bin/whisper-server "${server_args[@]}"
26 changes: 13 additions & 13 deletions stackchan_server/speech_recognition/whisper_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,16 @@ def __init__(
*,
config: WhisperServerSpeechToTextConfig | None = None,
) -> None:
self._conf = config or WhisperServerSpeechToTextConfig()
self._server_url = self._conf.url
self.config = config or WhisperServerSpeechToTextConfig()
self._server_url = self.config.url

async def transcribe(self, pcm_bytes: bytes) -> str:
rms_level = _pcm_rms_level(pcm_bytes)
if rms_level < self._conf.silence_rms_threshold:
if rms_level < self.config.silence_rms_threshold:
logger.info(
"Skipping whisper-server transcription because pcm rms %.2f is below silence threshold %.2f",
rms_level,
self._conf.silence_rms_threshold,
self.config.silence_rms_threshold,
)
return ""

Expand All @@ -65,28 +65,28 @@ async def transcribe(self, pcm_bytes: bytes) -> str:
transcript = await asyncio.to_thread(
self._request_transcript,
wav_bytes,
self._conf.language,
self.config.language,
)
if transcript:
logger.info("whisper-server transcript: %s", transcript)
return transcript

def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
fields = {
"response_format": self._conf.response_format,
"response_format": self.config.response_format,
}

normalized_language = language.strip()
if normalized_language:
fields["language"] = normalized_language

if self._conf.prompt:
fields["prompt"] = self._conf.prompt
if self.config.prompt:
fields["prompt"] = self.config.prompt

if self._conf.model:
fields["model"] = self._conf.model
if self.config.model:
fields["model"] = self.config.model

if self._conf.detect_language:
if self.config.detect_language:
fields["detect_language"] = "true"

body, content_type = _encode_multipart_formdata(
Expand All @@ -102,7 +102,7 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
logger.info("Running whisper-server request: POST %s", self._server_url)
try:
with urlopen(
request, timeout=self._conf.request_timeout_seconds
request, timeout=self.config.request_timeout_seconds
) as response:
response_body = response.read()
except HTTPError as exc:
Expand All @@ -113,7 +113,7 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str:
except URLError as exc:
raise RuntimeError(f"whisper-server request failed: {exc.reason}") from exc

if self._conf.response_format == "json":
if self.config.response_format == "json":
payload = _load_json_response_bytes(response_body)
if not isinstance(payload, Mapping):
return ""
Expand Down
5 changes: 4 additions & 1 deletion stackchan_server/wakeup_word_detection/whisper_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class WakeWordDetectionTimeout(WakeWordDetectionError):


class WhisperServerWakeWordDetectorConfig(BaseSettings):
keywords: list[str] = Field(default_factory=lambda: ["スタックチャン"])
keywords: list[str] = Field(default_factory=lambda: ["ハイスタックチャン"])
window_seconds: float = 3.0
interval_seconds: float = 0.5
timeout_seconds: float = 300.0
Expand Down Expand Up @@ -192,6 +192,9 @@ def _contains_wake_word(self, transcript: str) -> bool:
if not normalized_transcript:
return False

if self.recognizer.config.prompt in normalized_transcript:
# If the prompt is included in the transcript, it may indicate that the transcription is not accurate or that the model is confused. In this case, we choose to ignore the transcript to avoid false positives.
return False
for keyword in self.config.keywords:
normalized_keyword = _normalize_text(keyword)
if normalized_keyword and normalized_keyword in normalized_transcript:
Expand Down
Loading