diff --git a/skills.json b/skills.json index 99c3b11..5fde718 100644 --- a/skills.json +++ b/skills.json @@ -101,8 +101,9 @@ "id": "camera-claw", "name": "Camera Claw", "description": "Security camera for your AI agent — sandbox, record, and monitor OpenClaw activity.", - "version": "1.1.0", + "version": "2026.3.12", "category": "integrations", + "url": "https://github.com/SharpAI/CameraClaw", "repo_url": "https://github.com/SharpAI/CameraClaw", "code_structure": [ { "path": "SKILL.md", "desc": "Aegis skill manifest (11 params)" }, @@ -114,7 +115,7 @@ { "path": "scripts/health-check.js", "desc": "Container health checker" }, { "path": "docs/aegis_openclaw_note.md", "desc": "Aegis integration requirements" } ], - "tags": ["security", "sandbox", "monitoring", "openclaw"], + "tags": ["security", "sandbox", "monitoring", "openclaw", "ai-agent"], "platforms": [ "linux-x64", "linux-arm64", diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 96dd71c..55f7dda 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -165,6 +165,15 @@ async function llmCall(messages, opts = {}) { } const model = opts.model || (opts.vlm ? VLM_MODEL : LLM_MODEL) || undefined; + // For JSON-expected tests, disable thinking (Qwen3 /no_think directive) + // This prevents the model from wasting tokens on reasoning before outputting JSON + if (opts.expectJSON) { + const lastUserIdx = messages.findLastIndex(m => m.role === 'user'); + if (lastUserIdx >= 0) { + messages = [...messages]; + messages[lastUserIdx] = { ...messages[lastUserIdx], content: messages[lastUserIdx].content + ' /no_think' }; + } + } // Build request params const params = { @@ -173,6 +182,7 @@ async function llmCall(messages, opts = {}) { ...(model && { model }), ...(opts.temperature !== undefined && { temperature: opts.temperature }), ...(opts.maxTokens && { max_completion_tokens: opts.maxTokens }), + ...(opts.expectJSON && { response_format: { type: 'json_object' } }), ...(opts.tools && { tools: opts.tools }), }; @@ -181,6 +191,34 @@ async function llmCall(messages, opts = {}) { const idleMs = opts.timeout || IDLE_TIMEOUT_MS; let idleTimer = setTimeout(() => controller.abort(), idleMs); const resetIdle = () => { clearTimeout(idleTimer); idleTimer = setTimeout(() => controller.abort(), idleMs); }; + // Log prompt being sent + log(`\n 📤 Prompt (${messages.length} messages, params: ${JSON.stringify({maxTokens: opts.maxTokens, expectJSON: !!opts.expectJSON, response_format: params.response_format})}):`); + for (const m of messages) { + if (typeof m.content === 'string') { + log(` [${m.role}] ${m.content}`); + } else if (Array.isArray(m.content)) { + // Multi-part content (VLM with images) + for (const part of m.content) { + if (part.type === 'text') { + log(` [${m.role}] ${part.text}`); + } else if (part.type === 'image_url') { + const url = part.image_url?.url || ''; + const b64Match = url.match(/^data:([^;]+);base64,(.+)/); + if (b64Match) { + const mimeType = b64Match[1]; + const b64Data = b64Match[2]; + const sizeKB = Math.round(b64Data.length * 3 / 4 / 1024); + log(` [${m.role}] 🖼️ [Image: ${mimeType}, ~${sizeKB}KB]`); + log(`[IMG:${url}]`); + } else { + log(` [${m.role}] 🖼️ [Image URL: ${url.slice(0, 80)}…]`); + } + } + } + } else { + log(` [${m.role}] ${JSON.stringify(m.content).slice(0, 200)}`); + } + } try { const stream = await client.chat.completions.create(params, { @@ -193,6 +231,7 @@ async function llmCall(messages, opts = {}) { let model = ''; let usage = {}; let tokenCount = 0; + let tokenBuffer = ''; for await (const chunk of stream) { resetIdle(); @@ -204,8 +243,43 @@ async function llmCall(messages, opts = {}) { if (delta?.reasoning_content) reasoningContent += delta.reasoning_content; if (delta?.content || delta?.reasoning_content) { tokenCount++; + // Buffer and log tokens — tag with field source + const isContent = !!delta?.content; + const tok = delta?.content || delta?.reasoning_content || ''; + // Tag first token of each field type + if (tokenCount === 1) tokenBuffer += isContent ? '[C] ' : '[R] '; + tokenBuffer += tok; + if (tokenCount % 20 === 0) { + log(tokenBuffer); + tokenBuffer = ''; + } if (tokenCount % 100 === 0) { - log(` … ${tokenCount} tokens received`); + log(` … ${tokenCount} tokens (content: ${content.length}c, reasoning: ${reasoningContent.length}c)`); + } + + // Smart early abort for JSON-expected tests: + // If the model is producing reasoning_content (thinking) for a JSON test, + // abort after 100 reasoning tokens — it should output JSON directly. + if (opts.expectJSON && !isContent && tokenCount > 100) { + log(` ⚠ Aborting: ${tokenCount} reasoning tokens for JSON test — model is thinking instead of outputting JSON`); + controller.abort(); + break; + } + // If content is arriving, check it starts with JSON + if (opts.expectJSON && isContent && content.length >= 50) { + const stripped = content.replace(/[\s\S]*?<\/think>\s*/gi, '').trimStart(); + if (stripped.length >= 50 && !/^\s*[{\[]/.test(stripped)) { + log(` ⚠ Aborting: expected JSON but got: "${stripped.slice(0, 80)}…"`); + controller.abort(); + break; + } + } + // Hard cap: abort if token count far exceeds maxTokens (server may + // not count thinking tokens toward the limit) + if (opts.maxTokens && tokenCount > opts.maxTokens * 3) { + log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${opts.maxTokens}×3 safety limit`); + controller.abort(); + break; } } @@ -224,6 +298,9 @@ async function llmCall(messages, opts = {}) { if (chunk.usage) usage = chunk.usage; } + // Flush remaining token buffer + if (tokenBuffer) log(tokenBuffer); + // If the model only produced reasoning_content (thinking) with no content, // use the reasoning output as the response content for evaluation purposes. if (!content && reasoningContent) { @@ -337,12 +414,11 @@ ${userMessage} 4. Keep system messages (they contain tool results) ## Response Format -Return ONLY this JSON (no other text): -{"keep": [0, 5, 8], "summary": "brief 1-line summary of dropped exchanges"} +Respond with ONLY a valid JSON object, no other text: +{"keep": [], "summary": ""} -- "keep": array of message indices to KEEP (from the index list above) -- "summary": what the dropped messages were about (so context is not lost entirely) -- If nothing should be dropped, set keep to ALL indices and summary to ""`; +Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"} +If nothing should be dropped, keep ALL indices and set summary to "".`; } suite('📋 Context Preprocessing', async () => { @@ -356,7 +432,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 18, ts: '12:56 PM', text: 'What has happened today' }, { idx: 22, ts: '1:08 PM', text: 'What has happened today' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What has happened today?') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What has happened today?') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep), 'keep must be array'); assert(p.keep.length <= 3, `Expected ≤3, got ${p.keep.length}`); @@ -373,7 +449,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 18, ts: '12:00 PM', text: 'What is the system status?' }, { idx: 22, ts: '1:00 PM', text: 'What has happened today' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any alerts triggered?') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any alerts triggered?') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep), 'keep must be array'); assert(p.keep.includes(3) || p.keep.includes(10) || p.keep.includes(18), 'Should keep unique topics'); @@ -387,7 +463,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 6, ts: '10:00 AM', text: 'What is the system status?' }, { idx: 10, ts: '11:00 AM', text: 'Analyze the clip from 9:40 AM' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any new motion events?') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Any new motion events?') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep) && p.keep.length === 4, `Expected 4, got ${p.keep?.length}`); return `kept all 4 ✓`; @@ -398,7 +474,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 0, ts: '9:00 AM', text: 'Hello' }, { idx: 2, ts: '9:05 AM', text: 'Show cameras' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Thanks') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Thanks') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep), 'keep must be array'); return `kept ${p.keep.length}/2`; @@ -427,7 +503,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 36, ts: '12:30 PM', text: 'What happened today?' }, { idx: 38, ts: '12:45 PM', text: 'Were there any packages delivered?' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What happened today?') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'What happened today?') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep), 'keep must be array'); // 10 duplicates of "What happened today?" → should keep ≤12 of 20 @@ -444,7 +520,7 @@ suite('📋 Context Preprocessing', async () => { { idx: 3, ts: '9:05 AM', text: '[System] Alert triggered: person at front door' }, { idx: 4, ts: '9:10 AM', text: 'What happened today?' }, ]; - const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Show me alerts') }]); + const r = await llmCall([{ role: 'user', content: buildPreprocessPrompt(idx, 'Show me alerts') }], { maxTokens: 300, expectJSON: true }); const p = parseJSON(r.content); assert(Array.isArray(p.keep), 'keep must be array'); // System messages (idx 1, 3) must be kept @@ -538,7 +614,7 @@ suite('🧠 Knowledge Distillation', async () => { const r = await llmCall([ { role: 'system', content: DISTILL_PROMPT }, { role: 'user', content: `## Topic: Camera Setup\n## Existing KIs: (none)\n## Conversation\nUser: I have three cameras. Front door is a Blink Mini, living room is Blink Indoor, side parking is Blink Outdoor.\nAegis: Got it! Want to set up alerts?\nUser: Yes, person detection on front door after 10pm. My name is Sam.\nAegis: Alert set. Nice to meet you, Sam!` }, - ]); + ], { maxTokens: 500, expectJSON: true }); const p = parseJSON(r.content); assert(p && typeof p === 'object', 'Must return object'); const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0); @@ -550,7 +626,7 @@ suite('🧠 Knowledge Distillation', async () => { const r = await llmCall([ { role: 'system', content: DISTILL_PROMPT }, { role: 'user', content: `## Topic: Greeting\n## Existing KIs: (none)\n## Conversation\nUser: Hi\nAegis: Hello! How can I help?\nUser: Thanks, bye\nAegis: Goodbye!` }, - ]); + ], { maxTokens: 500, expectJSON: true }); const p = parseJSON(r.content); const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0); assert(facts === 0, `Expected 0 facts, got ${facts}`); @@ -561,7 +637,7 @@ suite('🧠 Knowledge Distillation', async () => { const r = await llmCall([ { role: 'system', content: DISTILL_PROMPT }, { role: 'user', content: `## Topic: Alert Configuration\n## Existing KIs: alert_preferences\n## Conversation\nUser: No notifications from side parking 8am-5pm. Too many false alarms from passing cars.\nAegis: Quiet hours set for side parking 8 AM-5 PM.\nUser: Front door alerts go to Telegram. Discord for everything else.\nAegis: Done — front door to Telegram, rest to Discord.` }, - ]); + ], { maxTokens: 500, expectJSON: true }); const p = parseJSON(r.content); const facts = (p.items || []).reduce((n, i) => n + (i.facts?.length || 0), 0) + (p.new_items || []).reduce((n, i) => n + (i.facts?.length || 0), 0); assert(facts >= 2, `Expected ≥2 facts, got ${facts}`); @@ -572,7 +648,7 @@ suite('🧠 Knowledge Distillation', async () => { const r = await llmCall([ { role: 'system', content: DISTILL_PROMPT }, { role: 'user', content: `## Topic: Camera Update\n## Existing KIs: home_profile (facts: ["3 cameras: Blink Mini front, Blink Indoor living, Blink Outdoor side", "Owner: Sam"])\n## Conversation\nUser: I just installed a fourth camera in the backyard. It's a Reolink Argus 3 Pro.\nAegis: Nice upgrade! I've noted your new backyard Reolink camera. That brings your total to 4 cameras.\nUser: Also, I got a dog named Max, golden retriever.\nAegis: Welcome, Max! I'll note that for the pet detections.` }, - ]); + ], { maxTokens: 500, expectJSON: true }); const p = parseJSON(r.content); const allFacts = [...(p.items || []).flatMap(i => i.facts || []), ...(p.new_items || []).flatMap(i => i.facts || [])]; assert(allFacts.length >= 2, `Expected ≥2 facts, got ${allFacts.length}`); @@ -587,7 +663,7 @@ suite('🧠 Knowledge Distillation', async () => { const r = await llmCall([ { role: 'system', content: DISTILL_PROMPT }, { role: 'user', content: `## Topic: Camera Change\n## Existing KIs: home_profile (facts: ["3 cameras: Blink Mini front, Blink Indoor living, Blink Outdoor side"])\n## Conversation\nUser: I replaced the living room camera. The Blink Indoor died. I put a Ring Indoor there now.\nAegis: Got it — living room camera is now a Ring Indoor. Updated.\nUser: Actually I also moved the side parking camera to the garage instead.\nAegis: Camera moved from side parking to garage, noted.` }, - ]); + ], { maxTokens: 500, expectJSON: true }); const p = parseJSON(r.content); const allFacts = [...(p.items || []).flatMap(i => i.facts || []), ...(p.new_items || []).flatMap(i => i.facts || [])]; assert(allFacts.length >= 1, `Expected ≥1 fact, got ${allFacts.length}`); @@ -634,7 +710,7 @@ suite('🔔 Event Deduplication', async () => { const r = await llmCall([ { role: 'system', content: 'You are a security event classifier. Respond only with valid JSON.' }, { role: 'user', content: buildDedupPrompt(s.current, s.recent, s.age_sec) }, - ], { maxTokens: 150, temperature: 0.1 }); + ], { maxTokens: 150, temperature: 0.1, expectJSON: true }); const p = parseJSON(r.content); if (s.expected_duplicate !== undefined) { assert(p.duplicate === s.expected_duplicate, `Expected duplicate=${s.expected_duplicate}, got ${p.duplicate}`); @@ -847,7 +923,7 @@ suite('🛡️ Security Classification', async () => { const r = await llmCall([ { role: 'system', content: SECURITY_CLASSIFY_PROMPT }, { role: 'user', content: `Event description: ${s.description}` }, - ], { maxTokens: 200, temperature: 0.1 }); + ], { maxTokens: 200, temperature: 0.1, expectJSON: true }); const p = parseJSON(r.content); assert(expectedClassifications.includes(p.classification), `Expected "${expectedLabel}", got "${p.classification}"`);