74th · 74th · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/docs/server_ja.md b/docs/server_ja.md
@@ -98,20 +98,33 @@ STACKCHAN_WHISPER_SERVER_PROMPT=""
 
 - `STACKCHAN_USE_WWD_WHISPER_SERVER`: `1`
 - `STACKCHAN_WWD_WHISPER_SERVER_URL`: wakeword 検出専用 Whisper Server の推論エンドポイント URL
+- `STACKCHAN_WWD_WHISPER_SERVER_KEYWORDS`: wakeword として検出するキーワード一覧。JSON 配列文字列で指定します
 - `STACKCHAN_WWD_WHISPER_SERVER_MODEL`: wakeword 検出専用に利用するモデル名
 - `STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE`: wakeword 検出専用 Whisper Server リクエストへ渡す language
 - `STACKCHAN_WWD_WHISPER_SERVER_PROMPT`: wakeword 検出専用 Whisper Server リクエストへ渡す prompt
+- `STACKCHAN_WWD_WHISPER_SERVER_IGNORE_PHRASES`: 認識結果に含まれていたら誤検出として無視するフレーズ一覧。JSON 配列文字列で指定します
 
 通常の音声認識で使う `STACKCHAN_WHISPER_SERVER_URL` / `STACKCHAN_WHISPER_SERVER_MODEL` とは別設定です。
 
+Whisper 系モデルは無音時や短い音声で、`prompt` に与えた文言をそのまま返すことがあります。
+wakeword 以外の語も `prompt` に入れておくと、無音時などの音をそれらの語として認識してくれることがあるため、次の例のように wakeword に音の近い語を複数入れておくと良いようです。
+`STACKCHAN_WWD_WHISPER_SERVER_IGNORE_PHRASES` には、wakeword の検出から除外するフレーズを入れておきます。
+
+### 例: 「ハイスタックチャン」で呼びかける設定
+
 ```
 STACKCHAN_USE_WWD_WHISPER_SERVER=1
-STACKCHAN_WWD_WHISPER_SERVER_URL="http://127.0.0.1:8080/inference"
+STACKCHAN_WWD_WHISPER_SERVER_URL=http://localhost:8431/inference
+STACKCHAN_WWD_WHISPER_SERVER_KEYWORDS='["ハイスタックチャン"]'
 STACKCHAN_WWD_WHISPER_SERVER_MODEL=
 STACKCHAN_WWD_WHISPER_SERVER_LANGUAGE="ja"
-STACKCHAN_WWD_WHISPER_SERVER_PROMPT="日本語で、スタックチャンという名前で、話しかけらるので、話しかけられたことを検出してください"
+STACKCHAN_WWD_WHISPER_SERVER_PROMPT="ハイスーチャン。ネエハイトチャン。ハイスタックチャン。ハイスズキクン。ハイフロントチャン。"
+STACKCHAN_WWD_WHISPER_SERVER_IGNORE_PHRASES=[]
 ```
 
+この設定では、サーバーサイド wakeword 検出が `ハイスタックチャン` を含む認識結果を検出対象にします。
+一方で、たとえば `prompt` に `ハイスタックチャンと言ってください` のような文言を含めていて、無音時などに whisper-server がその文言をそのまま返す場合は、そのフレーズを `STACKCHAN_WWD_WHISPER_SERVER_IGNORE_PHRASES` に設定しておくと、一致した認識結果を誤検出として無視します（上の例では `[]` のため、除外は行いません）。
+
 ## 音声合成の設定
 
 音声合成エンジンとして、以下に対応しています。

diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md
@@ -151,9 +151,10 @@ CoreS3 側は `has_server_wake_word=true` を受けると、デバイス側 wake
 
 - 環境変数 `STACKCHAN_USE_WWD_WHISPER_SERVER=1` の場合、サーバーは `@app.setup()` 完了後と `Idle` 復帰後に自動でサーバーサイド wakeword 検出を開始します。
 - サーバーは `StateCmd(ServerWwd)` を送信して `MESSAGE_KIND_SERVER_WWD_PCM` のマイク uplink を受信します。
-- 受信した音声の直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、
+- 受信した音声は既定で 1.0 秒以上たまってから、直近 3 秒窓を 0.5 秒ごとに音声認識へ渡し、
   定義キーワード（例: `スタクチャン`）を含むか判定します。
 - 各判定タイミングの認識結果はすべてログ出力されます。
+- `prompt` を設定した場合、無音時などにモデルが `prompt` と同じ文言を返すことがあります。`STACKCHAN_WWD_WHISPER_SERVER_IGNORE_PHRASES` に除外フレーズを複数設定すると、それらを含む認識結果は誤検出として無視します。
 - キーワード検出時は内部 wakeword イベントを発火し、通常の `talk_session` フローに進みます。
 - 検出完了時（検出/未検出を問わず）は `StateCmd(Idle)` で待機状態に戻します。
 - この間、CoreS3 の画面表示は `Listening` ではなく `Idle(Server-WWD)` を維持します。

diff --git a/misc/on_mac_demo/run-whisper-server-small.sh b/misc/on_mac_demo/run-whisper-server-small.sh
@@ -8,9 +8,8 @@ set -xe
     -l ja \
     -nt \
     -sns \
-    --vad \
-    -vm /opt/whisper.cpp/models/ggml-silero-v6.2.0.bin \
     -vt "0.5" \
     -vspd "100" \
     -vsd "500" \
-    -vp "200"
+    -vp "200" \
+    --convert
diff --git a/stackchan_server/wakeup_word_detection/whisper_server.py b/stackchan_server/wakeup_word_detection/whisper_server.py
@@ -4,7 +4,6 @@
 import unicodedata
 from logging import getLogger
 
-from pydantic import Field
 from pydantic_settings import BaseSettings
 
 from ..speech_recognition.whisper_server import (
@@ -25,13 +24,16 @@ class WakeWordDetectionTimeout(WakeWordDetectionError):
 
 
 class WhisperServerWakeWordDetectorConfig(BaseSettings):
-    keywords: list[str] = Field(default_factory=lambda: ["ハイスタックチャン"])
+    keywords: list[str] = ["スタックチャン"]
     window_seconds: float = 3.0
+    min_buffer_seconds: float = 1.0
     interval_seconds: float = 0.5
     timeout_seconds: float = 300.0
+    ignore_phrases: list[str] = []
 
     class Config:
-        env_prefix = "STACKCHAN_WWD_"
+        env_prefix = "STACKCHAN_WWD_WHISPER_SERVER_"
+
 
 
 class WhisperServerWakeWordSpeechToTextConfig(WhisperServerSpeechToTextConfig):
@@ -45,11 +47,11 @@ def __init__(
         *,
         recognizer: WhisperServerSpeechToText | None = None,
         config: WhisperServerWakeWordDetectorConfig | None = None,
+        recognizer_config: WhisperServerWakeWordSpeechToTextConfig | None = None,
     ) -> None:
         self.config = config or WhisperServerWakeWordDetectorConfig()
-        self.recognizer = recognizer or WhisperServerSpeechToText(
-            config=WhisperServerWakeWordSpeechToTextConfig()
-        )
+        self.recognizer_config = recognizer_config or WhisperServerWakeWordSpeechToTextConfig()
+        self.recognizer = recognizer or WhisperServerSpeechToText(config=self.recognizer_config)
         self._pcm_buffer = bytearray()
         self._running = False
         self._detected = False
@@ -112,10 +114,16 @@ async def handle_data(self, payload: bytes) -> None:
                 len(payload),
             )
             return
+        if not payload:
+            return
 
         self._pcm_buffer.extend(payload)
         self._truncate_buffer_to_window()
 
+        buffered_seconds = self._pcm_duration_seconds(len(self._pcm_buffer))
+        if buffered_seconds < self.config.min_buffer_seconds:
+            return
+
         loop = asyncio.get_running_loop()
         now = loop.time()
         if (now - self._last_inference_at) < self.config.interval_seconds:
@@ -171,6 +179,9 @@ async def _run_inference(self, pcm_bytes: bytes) -> None:
         if not pcm_bytes:
             return
 
+        if self._pcm_duration_seconds(len(pcm_bytes)) < self.config.min_buffer_seconds:
+            return
+
         try:
             async with self._lock:
                 transcript = await self.recognizer.transcribe(pcm_bytes)
@@ -192,25 +203,36 @@ def _contains_wake_word(self, transcript: str) -> bool:
         if not normalized_transcript:
             return False
 
-        if self.recognizer.config.prompt in normalized_transcript:
-            # If the prompt is included in the transcript, it may indicate that the transcription is not accurate or that the model is confused. In this case, we choose to ignore the transcript to avoid false positives.
-            return False
+        for ignore_phrase in self.config.ignore_phrases:
+            if ignore_phrase in normalized_transcript:
+                # If a configured ignore phrase appears in the transcript, the model has probably echoed the prompt or is otherwise confused, so ignore the transcript to avoid a false positive.
+                return False
+
         for keyword in self.config.keywords:
             normalized_keyword = _normalize_text(keyword)
             if normalized_keyword and normalized_keyword in normalized_transcript:
                 return True
         return False
 
     def _truncate_buffer_to_window(self) -> None:
-        sample_rate = LISTEN_AUDIO_FORMAT.sample_rate_hz
-        channels = LISTEN_AUDIO_FORMAT.channels
-        sample_width = LISTEN_AUDIO_FORMAT.sample_width
-        bytes_per_second = sample_rate * channels * sample_width
+        bytes_per_second = self._pcm_bytes_per_second()
         max_bytes = max(1, int(bytes_per_second * self.config.window_seconds))
         if len(self._pcm_buffer) <= max_bytes:
             return
         del self._pcm_buffer[: len(self._pcm_buffer) - max_bytes]
 
+    def _pcm_bytes_per_second(self) -> int:
+        sample_rate = LISTEN_AUDIO_FORMAT.sample_rate_hz
+        channels = LISTEN_AUDIO_FORMAT.channels
+        sample_width = LISTEN_AUDIO_FORMAT.sample_width
+        return sample_rate * channels * sample_width
+
+    def _pcm_duration_seconds(self, pcm_byte_length: int) -> float:
+        bytes_per_second = self._pcm_bytes_per_second()
+        if bytes_per_second <= 0:
+            return 0.0
+        return pcm_byte_length / float(bytes_per_second)
+
 
 def _normalize_text(text: str) -> str:
     normalized = unicodedata.normalize("NFKC", text or "")