diff --git a/README.md b/README.md index 22b9ba1..0ceafdb 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,13 @@ The web UI is a static Preact app served by `mind-map serve` over HTTP. It uses Both modes use the same wiki engine and the same wiki directory (`~/.mind-map/wiki` by default). Multiple stdio processes can safely share the same wiki via SQLite page locking. -## MCP Tools (10 total) +## MCP Tools (11 total) | Tool | Description | |------|-------------| | `search_pages` | Full-text search across page titles and content (SQLite FTS5) | -| `get_wiki_context` | Wiki overview: page count, top-level directories, recent pages | +| `get_wiki_digest` | Per-conversation orientation: page count, word/phrase cloud, active-use recents LRU, per-area counts, ~4 KB rendered markdown. Call this at the start of every new conversation. | +| `get_wiki_context` | Wiki overview: page count, top-level directories, recent pages (mtime-sorted). Also returns the digest fields for new clients. | | `get_page` | Read a page with parsed frontmatter, body, outgoing links, and backlinks | | `create_page` | Create a new page (markdown with optional YAML frontmatter) | | `update_page` | Update an existing page's content | @@ -102,6 +103,7 @@ Both modes use the same wiki engine and the same wiki directory (`~/.mind-map/wi ## Wiki Features +- **Per-conversation digest**: a compact orientation blob (cloud of top terms, recents LRU, area counts, rendered markdown) for LLMs to consume at conversation start. Always-current; background job rebuilds every 5 minutes; persisted to SQLite across restarts. 
- **YAML frontmatter**: structured metadata on every page (`title`, `type`, `status`, custom fields) - **Wikilinks**: `[[target]]` and `[[display|target]]` syntax, resolved to clickable links - **Backlink index**: every page knows what links to it diff --git a/SKILL.md b/SKILL.md index 97ac592..5845ab2 100644 --- a/SKILL.md +++ b/SKILL.md @@ -4,6 +4,7 @@ description: A wiki for AI agents and humans -- search, read, and write markdown tools: - search_pages - get_wiki_context + - get_wiki_digest - get_page - create_page - update_page @@ -47,12 +48,29 @@ Use mind-map as your **persistent memory**: ## Getting Oriented -**Always start by understanding what's already in the wiki:** +**Always start a new conversation with the digest:** +``` +get_wiki_digest() +→ returns a compact markdown blob: page count, top word/phrase cloud + (what this wiki is about), pages you or other agents recently + touched (intent, not file-mtime), and per-area page counts. + ~4 KB cap, ~1K tokens — designed to fit any context budget. +``` + +The digest is always-current: a background job rebuilds the cloud +every few minutes and the recents LRU updates on every page op. +Persisted to SQLite so a fresh server restart already has signal. + +If you need the legacy mtime-sorted "recently modified pages" list +or the filesystem-derived top-level directory list, call: ``` get_wiki_context() -→ returns page count, top-level directories, and 20 most recently modified pages +→ same shape as before, plus the digest fields layered on for free. ``` +New clients should prefer `get_wiki_digest`; `get_wiki_context` +remains for backwards compatibility. 
+ ## Searching ``` diff --git a/cmd/mind-map/main.go b/cmd/mind-map/main.go index 0090799..a0234c6 100644 --- a/cmd/mind-map/main.go +++ b/cmd/mind-map/main.go @@ -12,6 +12,7 @@ import ( "time" "github.com/aniongithub/mind-map/internal/config" + "github.com/aniongithub/mind-map/internal/digest" "github.com/aniongithub/mind-map/internal/httpapi" "github.com/aniongithub/mind-map/internal/logging" mindmcp "github.com/aniongithub/mind-map/internal/mcp" @@ -87,17 +88,57 @@ func init() { func runStdio(cmd *cobra.Command, args []string) error { dir, _ := cmd.Flags().GetString("dir") - w, err := wiki.Open(dir) + cfgPath := config.DefaultPath() + cfg, err := config.Load(cfgPath) + if err != nil { + slog.Warn("failed to load config, using defaults", slog.Any("error", err)) + cfg = config.DefaultConfig() + } + + w, err := wiki.Open(dir, wiki.WithOptions(wikiOptionsFromConfig(cfg))) if err != nil { return fmt.Errorf("open wiki: %w", err) } defer w.Close() + // Spin up the digest's background maintenance (cloud rebuild + + // recents flush) for the duration of the stdio session. Stop + // before Close so a mid-rebuild ticker doesn't race the DB + // shutdown. + dm := digest.NewManager(w, digestOptionsFromConfig(cfg)) + dm.Start(cmd.Context()) + defer dm.Stop() + s := mindmcp.NewServer(w, nil, getVersion()) slog.Info("mind-map MCP server starting", slog.String("mode", "stdio"), slog.String("wiki", w.Root())) return s.MCPServer().Run(cmd.Context(), &mcpsdk.StdioTransport{}) } +// wikiOptionsFromConfig maps the digest section of config.Config to +// the construction-time knobs the Wiki cares about (recents capacity, +// render cap, stopword extras). Zero/missing values keep the Wiki's +// own defaults — DigestConfig is documented as fully optional. 
+func wikiOptionsFromConfig(cfg *config.Config) wiki.Options { + d := cfg.Digest + return wiki.Options{ + RecentsSize: d.RecentsSize, + MaxRenderBytes: d.MaxRenderBytes, + StopwordsExtra: d.StopwordsExtra, + } +} + +// digestOptionsFromConfig maps the digest section to the runtime +// (ticker / rebuild) knobs the digest.Manager cares about. Same +// "zero means default" contract. +func digestOptionsFromConfig(cfg *config.Config) digest.Options { + d := cfg.Digest + return digest.Options{ + CloudRefresh: d.ParseCloudRefresh(), + CloudSize: d.CloudSize, + StopwordsExtra: d.StopwordsExtra, + } +} + func runServe(cmd *cobra.Command, args []string) error { dir, _ := cmd.Flags().GetString("dir") logFile, _ := cmd.Flags().GetString("log-file") @@ -145,12 +186,6 @@ func runServe(cmd *cobra.Command, args []string) error { // runHTTPServer wires the HTTP handler from internal/httpapi and serves it. // Shared by the interactive `serve` command and the system service. func runHTTPServer(addr, dir, webuiDir string, idleTimeout time.Duration, stopCh chan struct{}) error { - w, err := wiki.Open(dir) - if err != nil { - return fmt.Errorf("open wiki: %w", err) - } - defer w.Close() - cfgPath := config.DefaultPath() cfg, err := config.Load(cfgPath) if err != nil { @@ -158,6 +193,32 @@ func runHTTPServer(addr, dir, webuiDir string, idleTimeout time.Duration, stopCh cfg = config.DefaultConfig() } + w, err := wiki.Open(dir, wiki.WithOptions(wikiOptionsFromConfig(cfg))) + if err != nil { + return fmt.Errorf("open wiki: %w", err) + } + defer w.Close() + + // Background digest maintenance runs for the lifetime of the + // HTTP server. We use a context derived from stopCh so that the + // graceful /api/restart path (which closes stopCh) also stops + // the tickers cleanly. Stopping before Close ensures the LRU + // flush in the manager's final tick doesn't race with db.Close. 
+ dctx, dcancel := context.WithCancel(context.Background()) + defer dcancel() + go func() { + select { + case <-stopCh: + dcancel() + case <-dctx.Done(): + // Normal function return; the defer above cancelled us. + return + } + }() + dm := digest.NewManager(w, digestOptionsFromConfig(cfg)) + dm.Start(dctx) + defer dm.Stop() + handler := httpapi.New(httpapi.Deps{ Wiki: w, CfgPath: cfgPath, diff --git a/internal/config/config.go b/internal/config/config.go index 0e447a4..84dc10f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -120,9 +120,55 @@ func (s *SyncConfig) Remotes() []string { return remotes } +// DigestConfig holds tunables for the per-conversation orientation +// digest (cloud rebuild, recents LRU, render cap, stopword extras). +// All fields are optional; zero or invalid values fall back to the +// built-in defaults. Documented in detail in mind-map/plans/digest. +type DigestConfig struct { + // CloudSize caps the top-K terms surfaced in the word cloud. + // Default 50. Tunable up if your wiki is large enough that 50 + // terms feels too sparse; down if context budget is tight. + CloudSize int `json:"cloud_size,omitempty"` + + // RecentsSize caps the active-use LRU ring. Default 20. Applied + // at wiki Open; live changes via /api/settings take effect after + // the next server restart. + RecentsSize int `json:"recents_size,omitempty"` + + // CloudRefresh controls how often the cloud rebuilds. Default 5m. + // Accepts any time.ParseDuration value; unparseable values and + // values below 30 seconds fall back to the 5m default so a busy wiki doesn't burn CPU. + CloudRefresh string `json:"cloud_refresh,omitempty"` + + // StopwordsExtra extends the built-in English stopword list. + // Words are case-folded on load. Useful for domain-specific + // noise like "TODO" or "FIXME". + StopwordsExtra []string `json:"stopwords_extra,omitempty"` + + // MaxRenderBytes caps the rendered markdown blob. Default 4096 + // (~1K tokens for most LLMs). 
Trim discipline when over: drop + // recents, then cloud, never areas/header/footer. + MaxRenderBytes int `json:"max_render_bytes,omitempty"` +} + +// ParseCloudRefresh returns the cloud rebuild interval. Returns the +// default (5m) if empty, invalid, or below 30 seconds — anything +// faster is wasted CPU for a signal nobody reads that often. +func (d *DigestConfig) ParseCloudRefresh() time.Duration { + if d.CloudRefresh == "" { + return 5 * time.Minute + } + v, err := time.ParseDuration(d.CloudRefresh) + if err != nil || v < 30*time.Second { + return 5 * time.Minute + } + return v +} + // Config holds all runtime settings. type Config struct { - Sync SyncConfig `json:"sync"` + Sync SyncConfig `json:"sync"` + Digest DigestConfig `json:"digest,omitempty"` } // DefaultConfig returns a Config with sensible defaults. diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 0af2e6a..bf7acb6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -172,3 +172,81 @@ func TestSaveAndLoad(t *testing.T) { t.Errorf("loaded mapping prefix = %q", loaded.Sync.Mappings[0].Prefix) } } + +func TestParseCloudRefresh(t *testing.T) { + tests := []struct { + input string + want time.Duration + }{ + {"5m", 5 * time.Minute}, + {"10m", 10 * time.Minute}, + {"1h", 1 * time.Hour}, + // Floor: anything < 30s falls back to the default to protect a + // busy wiki from CPU churn. 
+ {"1s", 5 * time.Minute}, + {"", 5 * time.Minute}, // empty → default + {"junk", 5 * time.Minute}, // invalid → default + } + for _, tc := range tests { + d := DigestConfig{CloudRefresh: tc.input} + if got := d.ParseCloudRefresh(); got != tc.want { + t.Errorf("ParseCloudRefresh(%q) = %v, want %v", tc.input, got, tc.want) + } + } +} + +func TestDigestConfig_RoundtripJSON(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.json") + + cfg := DefaultConfig() + cfg.Digest.CloudSize = 75 + cfg.Digest.RecentsSize = 30 + cfg.Digest.CloudRefresh = "10m" + cfg.Digest.StopwordsExtra = []string{"TODO", "FIXME"} + cfg.Digest.MaxRenderBytes = 8192 + + if err := Save(path, cfg); err != nil { + t.Fatalf("Save: %v", err) + } + loaded, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } + if loaded.Digest.CloudSize != 75 { + t.Errorf("CloudSize = %d, want 75", loaded.Digest.CloudSize) + } + if loaded.Digest.RecentsSize != 30 { + t.Errorf("RecentsSize = %d, want 30", loaded.Digest.RecentsSize) + } + if loaded.Digest.ParseCloudRefresh() != 10*time.Minute { + t.Errorf("CloudRefresh = %v, want 10m", loaded.Digest.ParseCloudRefresh()) + } + if len(loaded.Digest.StopwordsExtra) != 2 || loaded.Digest.StopwordsExtra[0] != "TODO" { + t.Errorf("StopwordsExtra = %v", loaded.Digest.StopwordsExtra) + } + if loaded.Digest.MaxRenderBytes != 8192 { + t.Errorf("MaxRenderBytes = %d, want 8192", loaded.Digest.MaxRenderBytes) + } +} + +func TestDigestConfig_BackwardsCompatible(t *testing.T) { + // A config file written before the digest section existed must + // still load without errors and yield zero-valued digest fields + // (which the consumers treat as "use defaults"). 
+ dir := t.TempDir() + path := filepath.Join(dir, "config.json") + if err := os.WriteFile(path, []byte(`{"sync":{"enabled":false,"interval":"30s"}}`), 0o600); err != nil { + t.Fatalf("write legacy config: %v", err) + } + loaded, err := Load(path) + if err != nil { + t.Fatalf("Load legacy config: %v", err) + } + if loaded.Digest.CloudSize != 0 { + t.Errorf("expected zero CloudSize on legacy config, got %d", loaded.Digest.CloudSize) + } + if loaded.Digest.ParseCloudRefresh() != 5*time.Minute { + t.Errorf("expected default 5m on legacy config, got %v", loaded.Digest.ParseCloudRefresh()) + } +} diff --git a/internal/digest/manager.go b/internal/digest/manager.go new file mode 100644 index 0000000..9282464 --- /dev/null +++ b/internal/digest/manager.go @@ -0,0 +1,203 @@ +// Package digest runs the background maintenance for a wiki's per- +// conversation orientation digest: a periodic rebuild of the word/ +// phrase cloud and a periodic flush of the active-use recents LRU +// to SQLite. +// +// The package mirrors internal/sync in shape: a Manager constructed +// over a *wiki.Wiki, with Start(ctx) / Stop() lifecycle that the +// embedder (cmd/mind-map, internal/httpapi) supervises. Keeping the +// tickers out of the Wiki itself preserves the same separation sync +// already established — the storage engine has no goroutines of its +// own; lifecycle is the embedder's concern. +package digest + +import ( + "context" + "log/slog" + "sync" + "time" + + "github.com/aniongithub/mind-map/internal/wiki" +) + +// Default tick intervals match the plan. NewManager applies them +// whenever the corresponding Options field is unset (<= 0); they're +// reasonable for any wiki size below the millions of pages. +const ( + defaultCloudRefresh = 5 * time.Minute + defaultRecentsRefresh = 30 * time.Second + + // defaultCloudSize matches the plan's cloud_size default. 
The + // top-K selection is the only knob that materially affects the + // rendered digest's word density; everything else is plumbing. + defaultCloudSize = 50 +) + +// Manager runs the two background tickers (cloud rebuild + recents +// flush) for a single wiki. Construct one with NewManager, hand its +// Start a context tied to the process lifetime, and call Stop before +// closing the wiki — closing the wiki out from under a mid-rebuild +// ticker is a `sql: database is closed` race waiting to happen. +// +// Start and Stop are each idempotent (guarded by their own sync.Once), +// but they are not synchronized with each other: call Start first, then +// Stop. A Manager is one-shot; construct a fresh one if you need a restart. +type Manager struct { + w *wiki.Wiki + + cloudRefresh time.Duration + recentsRefresh time.Duration + cloudSize int + stopwordsExtra []string + + startOnce sync.Once + stopOnce sync.Once + cancel context.CancelFunc + done chan struct{} +} + +// Options tunes Manager behavior. Zero-value Options uses the +// package defaults (5m cloud rebuild, 30s recents flush, top-50 +// cloud terms); embedders populate it from config.json's digest section. +type Options struct { + CloudRefresh time.Duration + RecentsRefresh time.Duration + CloudSize int + // StopwordsExtra appends to the built-in English stopword list. + // Mirrors plan's digest.stopwords_extra config knob. + StopwordsExtra []string +} + +// NewManager constructs an unstarted Manager. Pass zero Options for +// defaults. +func NewManager(w *wiki.Wiki, opts Options) *Manager { + if opts.CloudRefresh <= 0 { + opts.CloudRefresh = defaultCloudRefresh + } + if opts.RecentsRefresh <= 0 { + opts.RecentsRefresh = defaultRecentsRefresh + } + if opts.CloudSize <= 0 { + opts.CloudSize = defaultCloudSize + } + return &Manager{ + w: w, + cloudRefresh: opts.CloudRefresh, + recentsRefresh: opts.RecentsRefresh, + cloudSize: opts.CloudSize, + stopwordsExtra: opts.StopwordsExtra, + } +} + +// Start kicks off the two tickers. 
Idempotent: a second call is a +// no-op. Returns immediately after spawning goroutines; use Stop to +// wait for clean shutdown. +// +// The cloud is rebuilt synchronously once before the goroutine loop +// starts so a freshly-opened wiki has cloud terms in its digest +// without a 5-minute warm-up. On cold start over a 1k-page wiki this +// takes < 100ms; we accept that latency on Start so the first +// post-open digest read is useful. +func (m *Manager) Start(ctx context.Context) { + m.startOnce.Do(func() { + ctx, m.cancel = context.WithCancel(ctx) + m.done = make(chan struct{}) + + // Synchronous first build so cold-start digests have an + // About: line. We deliberately don't gate on whether a + // persisted cloud was loaded: even if it was, the on-disk + // content may have shifted while the server was off, and + // the cost is small. A failure here logs and continues — + // the tickers below will retry. + m.rebuildCloud(ctx) + + go m.run(ctx) + slog.Info("digest manager started", + slog.Duration("cloud_refresh", m.cloudRefresh), + slog.Duration("recents_refresh", m.recentsRefresh), + slog.Int("cloud_size", m.cloudSize), + ) + }) +} + +// Stop cancels the tickers and blocks until the loop goroutine has +// exited. Idempotent. Safe to call after Start, after another Stop, +// or even without ever calling Start (in which case it returns +// immediately). +// +// A final recents flush runs as the loop exits so the last few touches +// between ticker fires aren't lost on shutdown. The Wiki's own Close() +// also calls persistRecents — both paths converge on the same row, +// and the SQLite write is atomic, so the redundancy is harmless. +func (m *Manager) Stop() { + m.stopOnce.Do(func() { + if m.cancel == nil { + return // Stop without Start: nothing to do. + } + m.cancel() + <-m.done + slog.Info("digest manager stopped") + }) +} + +// run is the goroutine that drives both tickers. 
The cloud rebuild +// is much heavier than the recents flush, but both are well below the +// 30s recents tick on any reasonable wiki size, so a shared goroutine +// with two tickers is simpler than two goroutines and adequately +// non-blocking for the workload. +func (m *Manager) run(ctx context.Context) { + defer close(m.done) + + cloudTick := time.NewTicker(m.cloudRefresh) + defer cloudTick.Stop() + recentsTick := time.NewTicker(m.recentsRefresh) + defer recentsTick.Stop() + + for { + select { + case <-ctx.Done(): + // Final flush so we don't lose the last ~30s of + // touches. Use a detached background context: the + // loop's ctx is already cancelled, but the DB write + // itself should still get a chance to complete. + m.flushRecents(context.Background()) + return + case <-cloudTick.C: + m.rebuildCloud(ctx) + case <-recentsTick.C: + m.flushRecents(ctx) + } + } +} + +// rebuildCloud runs one cloud rebuild + persistence cycle. Failures +// are logged and swallowed — the digest must degrade gracefully on +// transient errors rather than crashing a long-running service. +func (m *Manager) rebuildCloud(ctx context.Context) { + start := time.Now() + terms, err := m.w.BuildCloud(ctx, m.cloudSize, m.stopwordsExtra) + if err != nil { + slog.Warn("digest cloud rebuild failed", slog.Any("error", err)) + return + } + m.w.SetCloud(terms) + if err := m.w.PersistCloud(ctx); err != nil { + slog.Warn("digest cloud persist failed", slog.Any("error", err)) + } + slog.Info("digest cloud rebuilt", + slog.Int("terms", len(terms)), + slog.Duration("elapsed", time.Since(start)), + ) +} + +// flushRecents writes the LRU to wiki_state if it's been touched +// since the last write. The dirty gate avoids gratuitous SQLite writes +// on an idle server. 
+func (m *Manager) flushRecents(ctx context.Context) { + if !m.w.RecentsDirty() { + return + } + if err := m.w.PersistRecents(ctx); err != nil { + slog.Warn("digest recents persist failed", slog.Any("error", err)) + } +} diff --git a/internal/digest/manager_test.go b/internal/digest/manager_test.go new file mode 100644 index 0000000..d0f2759 --- /dev/null +++ b/internal/digest/manager_test.go @@ -0,0 +1,139 @@ +package digest + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/aniongithub/mind-map/internal/wiki" +) + +// testWiki creates a temporary wiki with a few seed pages so the +// cloud rebuild has something to count. Kept private to this test +// file — the public wiki package has its own testWiki, but we can't +// import test helpers across packages. +func testWiki(t *testing.T) *wiki.Wiki { + t.Helper() + dir := t.TempDir() + + pages := map[string]string{ + "index.md": "# Home\n\nThis wiki is about mind-map, digest, and SQLite.\n", + "projects/mind-map.md": "# mind-map\n\nA wiki engine. SQLite-backed. Digest support.\n", + "notes/sqlite.md": "# SQLite\n\nSQLite is fast and embedded. mind-map uses SQLite.\n", + } + for name, content := range pages { + full := filepath.Join(dir, name) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", name, err) + } + if err := os.WriteFile(full, []byte(content), 0o644); err != nil { + t.Fatalf("seed %s: %v", name, err) + } + } + + w, err := wiki.Open(dir) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { w.Close() }) + return w +} + +func TestManager_StartTriggersImmediateCloudRebuild(t *testing.T) { + w := testWiki(t) + + m := NewManager(w, Options{ + // Long tick so the ticker doesn't fire during the test — + // we want to assert the *synchronous* initial build only. 
+ CloudRefresh: time.Hour, + RecentsRefresh: time.Hour, + }) + m.Start(context.Background()) + defer m.Stop() + + // After Start, the cloud cache should be populated and the digest + // markdown should contain an About: line. + d, err := w.Digest(context.Background()) + if err != nil { + t.Fatalf("Digest: %v", err) + } + if len(d.Cloud) == 0 { + t.Fatalf("cloud should be populated after Start, got empty") + } + if !strings.Contains(d.Markdown, "About:") { + t.Fatalf("digest missing About: line:\n%s", d.Markdown) + } +} + +func TestManager_StopIsIdempotent(t *testing.T) { + w := testWiki(t) + m := NewManager(w, Options{CloudRefresh: time.Hour, RecentsRefresh: time.Hour}) + m.Start(context.Background()) + + m.Stop() + m.Stop() // second Stop must not panic or block +} + +func TestManager_StopWithoutStartIsNoOp(t *testing.T) { + w := testWiki(t) + m := NewManager(w, Options{}) + m.Stop() // must not panic, must not hang +} + +func TestManager_RecentsFlushOnTick(t *testing.T) { + w := testWiki(t) + ctx := context.Background() + + m := NewManager(w, Options{ + CloudRefresh: time.Hour, + RecentsRefresh: 50 * time.Millisecond, + }) + m.Start(ctx) + defer m.Stop() + + // Touch via a real Wiki op so dirty flips on. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if !w.RecentsDirty() { + t.Fatalf("LRU should be dirty after GetPage") + } + + // Wait for the ticker to fire and flush. + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + if !w.RecentsDirty() { + return // success: ticker flushed and cleared dirty + } + time.Sleep(20 * time.Millisecond) + } + t.Fatalf("LRU still dirty after 1s; ticker did not flush") +} + +func TestManager_StopFlushesRecents(t *testing.T) { + w := testWiki(t) + ctx := context.Background() + + m := NewManager(w, Options{ + // Long ticks so only the Stop-time flush can save us. 
+ CloudRefresh: time.Hour, + RecentsRefresh: time.Hour, + }) + m.Start(ctx) + + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if !w.RecentsDirty() { + t.Fatalf("LRU should be dirty after touch") + } + + m.Stop() + + if w.RecentsDirty() { + t.Fatalf("Stop should have flushed dirty LRU; still dirty") + } +} diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go index cf96774..8b4874a 100644 --- a/internal/httpapi/server.go +++ b/internal/httpapi/server.go @@ -161,6 +161,7 @@ func (s *Server) shutdown() { func (s *Server) register(mux *http.ServeMux) { mux.HandleFunc("GET /api/version", s.getVersion) mux.HandleFunc("GET /api/context", s.getContext) + mux.HandleFunc("GET /api/digest", s.getDigest) mux.HandleFunc("GET /api/pages", s.listPages) mux.HandleFunc("GET /api/pages/{path...}", s.getPage) mux.HandleFunc("POST /api/pages", s.createPage) @@ -306,6 +307,27 @@ func (s *Server) getContext(rw http.ResponseWriter, r *http.Request) { writeJSON(rw, wctx) } +// getDigest handles GET /api/digest. Returns the full Digest struct +// (page count, cloud terms, recents LRU, per-area summaries, rendered +// markdown). Intended for two callers: +// +// - Agents / MCP clients that prefer the HTTP path over the MCP +// tool (e.g. tests, scripts, or alternate clients). +// - The WebUI, which can render its own widgets (e.g. a word-cloud +// visualization) off the structured fields rather than parsing +// the markdown. +// +// Cheap on cache hit, sub-millisecond on miss. Safe to call frequently +// (e.g. WebUI polling); the in-memory digestCache absorbs the load. 
+func (s *Server) getDigest(rw http.ResponseWriter, r *http.Request) { + d, err := s.deps.Wiki.Digest(r.Context()) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + writeJSON(rw, d) +} + func (s *Server) listPages(rw http.ResponseWriter, r *http.Request) { prefix := r.URL.Query().Get("prefix") pages, err := s.deps.Wiki.ListPages(r.Context(), prefix) diff --git a/internal/httpapi/server_test.go b/internal/httpapi/server_test.go index bbee42e..c08f747 100644 --- a/internal/httpapi/server_test.go +++ b/internal/httpapi/server_test.go @@ -358,3 +358,50 @@ func TestReindexDetectsDirectFilesystemChanges(t *testing.T) { t.Errorf("page still not indexed after reindex (got %d body=%s)", rec.Code, rec.Body.String()) } } + +func TestGetDigest(t *testing.T) { + h := newTestServer(t) + + // Seed a page so the digest has something to summarize. + rec := doJSON(t, h, "POST", "/api/pages", map[string]string{ + "path": "topics/sqlite", + "content": "# SQLite\n\nSQLite is a fast embedded database.\n", + }) + if rec.Code != 201 { + t.Fatalf("seed: %d %s", rec.Code, rec.Body.String()) + } + + rec = doJSON(t, h, "GET", "/api/digest", nil) + if rec.Code != 200 { + t.Fatalf("digest: %d %s", rec.Code, rec.Body.String()) + } + + var d wiki.Digest + if err := json.Unmarshal(rec.Body.Bytes(), &d); err != nil { + t.Fatalf("unmarshal: %v\n%s", err, rec.Body.String()) + } + + if d.PageCount < 1 { + t.Errorf("page count = %d, want >= 1", d.PageCount) + } + if d.Markdown == "" { + t.Errorf("markdown empty") + } + if !strings.Contains(d.Markdown, "This wiki contains") { + t.Errorf("markdown missing header sentence:\n%s", d.Markdown) + } + if len(d.Areas) == 0 { + t.Errorf("expected at least one area, got none") + } + // Recently active should include the page we just created + // (CreatePage touches the LRU). 
+ found := false + for _, p := range d.Recents { + if p == "topics/sqlite" { + found = true + } + } + if !found { + t.Errorf("recents missing topics/sqlite: %v", d.Recents) + } +} diff --git a/internal/mcp/server.go b/internal/mcp/server.go index a58839c..86d622f 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -61,9 +61,14 @@ func (s *Server) registerTools() { mcp.AddTool(s.server, &mcp.Tool{ Name: "get_wiki_context", - Description: "Get wiki orientation: page count, top-level directories, and 20 most recently modified pages.", + Description: "Get wiki orientation: page count, top-level directories, and 20 most recently modified pages. Also returns the digest (cloud_terms, recents LRU, per-area counts, rendered markdown) for new clients — older clients can ignore the extra fields.", }, s.getWikiContext) + mcp.AddTool(s.server, &mcp.Tool{ + Name: "get_wiki_digest", + Description: "Get a compact, always-current per-conversation orientation of this wiki. Returns: a rendered markdown blob (suitable to paste into context), a word/phrase cloud across all page bodies (what this wiki is about), an LRU of pages the user or agent has actively touched (intent, not file-mtime), and per-area page counts. Call this at the start of every new conversation. 
Cheaper and more deterministic than searching blindly; complements search_pages once you know what to look for.", + }, s.getWikiDigest) + mcp.AddTool(s.server, &mcp.Tool{ Name: "get_page", Description: "Read a wiki page with parsed frontmatter, body, outgoing links, and backlinks.", @@ -175,6 +180,24 @@ func (s *Server) getWikiContext(ctx context.Context, _ *mcp.CallToolRequest, _ a return textResult(wctx) } +func (s *Server) getWikiDigest(ctx context.Context, _ *mcp.CallToolRequest, _ any) (*mcp.CallToolResult, any, error) { + start := time.Now() + d, err := s.wiki.Digest(ctx) + if err != nil { + slog.Error("tool.get_wiki_digest failed", slog.Any("error", err)) + return nil, nil, err + } + slog.Info("tool.get_wiki_digest", + slog.Int("page_count", d.PageCount), + slog.Int("cloud_terms", len(d.Cloud)), + slog.Int("recents", len(d.Recents)), + slog.Int("areas", len(d.Areas)), + slog.Int("bytes", len(d.Markdown)), + slog.Duration("elapsed", time.Since(start)), + ) + return textResult(d) +} + func (s *Server) getPage(ctx context.Context, _ *mcp.CallToolRequest, input pagePathInput) (*mcp.CallToolResult, any, error) { start := time.Now() page, err := s.wiki.GetPage(ctx, input.Path) diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index dd30bc8..8bd38b0 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -176,6 +176,7 @@ func TestListTools(t *testing.T) { expected := map[string]bool{ "search_pages": false, "get_wiki_context": false, + "get_wiki_digest": false, "get_page": false, "create_page": false, "update_page": false, @@ -207,6 +208,40 @@ func TestGetWikiContext(t *testing.T) { if ctx.PageCount != 4 { t.Errorf("PageCount = %d, want 4", ctx.PageCount) } + // New digest fields should be populated on the same response so + // existing get_wiki_context callers get the orientation upgrade + // for free (plan open question #4 — keep old shape, add fields). 
+ if ctx.Markdown == "" { + t.Errorf("expected digest markdown to be populated on get_wiki_context") + } + if len(ctx.Areas) == 0 { + t.Errorf("expected areas to be populated on get_wiki_context") + } +} + +func TestGetWikiDigest(t *testing.T) { + session := setupTestServer(t) + text := callTool(t, session, "get_wiki_digest", nil) + + var d wiki.Digest + if err := json.Unmarshal([]byte(text), &d); err != nil { + t.Fatalf("unmarshal: %v\n%s", err, text) + } + if d.PageCount != 4 { + t.Errorf("PageCount = %d, want 4", d.PageCount) + } + if d.Markdown == "" { + t.Errorf("Markdown empty") + } + if !strings.Contains(d.Markdown, "This wiki contains") { + t.Errorf("markdown missing header sentence:\n%s", d.Markdown) + } + if !strings.Contains(d.Markdown, "## Areas") { + t.Errorf("markdown missing Areas section:\n%s", d.Markdown) + } + if len(d.Areas) == 0 { + t.Errorf("expected at least one area in structured output") + } } func TestGetPage(t *testing.T) { diff --git a/internal/wiki/cloud.go b/internal/wiki/cloud.go new file mode 100644 index 0000000..5fab2ba --- /dev/null +++ b/internal/wiki/cloud.go @@ -0,0 +1,318 @@ +package wiki + +import ( + "context" + "sort" + "strings" + "sync" + "unicode" +) + +// CloudTerm is a single entry in the rendered word/phrase cloud. +type CloudTerm struct { + Term string `json:"term"` + Count int `json:"count"` +} + +// defaultStopwords is the built-in English stopword list applied to +// every wiki's cloud. Users add domain-specific extras via config +// (digest.stopwords_extra) which are merged on top. +// +// Kept intentionally conservative: only true function words and the +// most generic English filler. Domain terms (even common ones like +// "wiki" or "page") are left to the per-wiki frequency signal to +// dampen — a wiki that's literally *about* wikis should be allowed +// to say so. 
var defaultStopwords = map[string]struct{}{
	"a": {}, "an": {}, "and": {}, "are": {}, "as": {}, "at": {},
	"be": {}, "but": {}, "by": {}, "can": {},
	"do": {}, "does": {}, "for": {}, "from": {},
	"had": {}, "has": {}, "have": {}, "he": {}, "her": {}, "here": {},
	"hers": {}, "him": {}, "his": {}, "how": {},
	"i": {}, "if": {}, "in": {}, "into": {}, "is": {}, "it": {}, "its": {},
	"just": {}, "may": {}, "might": {}, "must": {},
	"no": {}, "not": {}, "now": {}, "of": {}, "off": {}, "on": {}, "one": {},
	"only": {}, "or": {}, "other": {}, "our": {}, "ours": {}, "out": {},
	"over": {}, "own": {},
	"s": {}, "she": {}, "should": {}, "so": {}, "some": {}, "such": {},
	"t": {}, "than": {}, "that": {}, "the": {}, "their": {}, "them": {},
	"then": {}, "there": {}, "these": {}, "they": {}, "this": {}, "those": {},
	"to": {}, "too": {},
	"under": {}, "until": {}, "up": {}, "upon": {},
	"was": {}, "we": {}, "were": {}, "what": {}, "when": {}, "where": {},
	"which": {}, "while": {}, "who": {}, "whom": {}, "why": {}, "will": {},
	"with": {}, "would": {},
	"you": {}, "your": {}, "yours": {},
}

// cloudBuilder tokenizes page bodies and folds them into unigram and
// bigram counts. The builder itself only carries the effective
// stopword set; the count maps are owned by the caller and passed to
// addPage, so one builder can be reused across every page of a
// rebuild.
type cloudBuilder struct {
	stopwords map[string]struct{}
}

// newCloudBuilder constructs a builder whose stopword set is the
// built-in defaults merged with extra. Each extra is trimmed and
// lowercased so it matches the tokenizer's lowercase output; entries
// that are empty after trimming are ignored. extra may be nil.
func newCloudBuilder(extra []string) *cloudBuilder {
	sw := make(map[string]struct{}, len(defaultStopwords)+len(extra))
	for k := range defaultStopwords {
		sw[k] = struct{}{}
	}
	for _, w := range extra {
		w = strings.ToLower(strings.TrimSpace(w))
		if w != "" {
			sw[w] = struct{}{}
		}
	}
	return &cloudBuilder{stopwords: sw}
}

// isStopword reports whether t is filtered out of the cloud. In
// addition to the configured stopword set, single-character tokens
// and pure-numeric tokens are dropped: neither carries useful "about"
// signal and both massively inflate the long tail. The empty string
// is also reported as a stopword.
//
// Length is measured in runes, not bytes, so a one-letter non-ASCII
// token such as "я" or "é" is dropped exactly like "x" is.
func (b *cloudBuilder) isStopword(t string) bool {
	if _, ok := b.stopwords[t]; ok {
		return true
	}
	runes := 0
	allDigit := true
	for _, r := range t {
		runes++
		if !unicode.IsDigit(r) {
			allDigit = false
		}
		// Two or more runes with at least one non-digit: a real term.
		if runes >= 2 && !allDigit {
			return false
		}
	}
	// Reaching here means the token is shorter than two runes or
	// consists solely of digits.
	return true
}

// tokenize splits a body into lowercase word tokens. The rules are
// deliberately simple and deterministic:
//
//   - Lowercase everything.
//   - A token is a maximal run of letters / digits / underscores /
//     hyphens. Hyphens and underscores are kept because identifiers
//     like "mind-map" or "page_count" are exactly the kinds of terms
//     we want to surface intact.
//   - Wikilink markup ([[...]]) needs no special handling: brackets,
//     pipes and slashes are ordinary separators, so a link to
//     [[projects/mind-map]] contributes "projects" and "mind-map".
//   - Markdown punctuation (#, *, `, etc.) becomes a separator.
//   - Code fences and inline code are NOT stripped: code identifiers
//     are part of what a technical wiki is about, and dropping them
//     flattens the cloud.
//
// Leading and trailing hyphens/underscores are trimmed from every
// token. A run made up only of hyphens/underscores (e.g. a "--"
// separator) therefore yields an empty-string token; addPage treats
// that as a phrase break rather than dropping it silently.
func (b *cloudBuilder) tokenize(body string) []string {
	tokens := make([]string, 0, len(body)/6)
	var cur strings.Builder
	flush := func() {
		if cur.Len() > 0 {
			tokens = append(tokens, cur.String())
			cur.Reset()
		}
	}
	for _, r := range body {
		switch {
		case unicode.IsLetter(r) || unicode.IsDigit(r):
			cur.WriteRune(unicode.ToLower(r))
		case r == '-' || r == '_':
			// Always kept while scanning; the ones that end up at
			// either edge of a token are trimmed below.
			cur.WriteRune(r)
		default:
			flush()
		}
	}
	flush()

	// Trim leading/trailing hyphens and underscores (e.g. "--foo").
	for i, t := range tokens {
		tokens[i] = strings.Trim(t, "-_")
	}
	return tokens
}

// addPage folds one page's tokens into the running unigram and bigram
// counts. Both maps must be non-nil.
//
// Bigrams require BOTH ends to pass the stopword filter (plan open
// question #2 lean): otherwise common phrases like "the wiki" would
// dominate purely because "the" is high-frequency, even though the
// pair is no more informative than "wiki" alone. A stopword or an
// empty token between two terms breaks the pair.
func (b *cloudBuilder) addPage(body string, unigrams, bigrams map[string]int) {
	// prev is the previous token when it is eligible to start a
	// bigram, or "" when the previous token was filtered out.
	var prev string
	for _, tok := range b.tokenize(body) {
		if tok == "" || b.isStopword(tok) {
			prev = ""
			continue
		}
		unigrams[tok]++
		if prev != "" {
			bigrams[prev+" "+tok]++
		}
		prev = tok
	}
}

// topK selects the K highest-count entries from the given map. Ties
// break alphabetically so the output is stable across rebuilds —
// otherwise a digest cache invalidation could shuffle the cloud for
// no reason a user would understand.
+func topK(counts map[string]int, k int) []CloudTerm { + if k <= 0 || len(counts) == 0 { + return nil + } + terms := make([]CloudTerm, 0, len(counts)) + for t, n := range counts { + terms = append(terms, CloudTerm{Term: t, Count: n}) + } + sort.Slice(terms, func(i, j int) bool { + if terms[i].Count != terms[j].Count { + return terms[i].Count > terms[j].Count + } + return terms[i].Term < terms[j].Term + }) + if len(terms) > k { + terms = terms[:k] + } + return terms +} + +// buildCloud computes the top-K most frequent terms across all page +// bodies. The result mixes unigrams and bigrams: bigrams are scored +// by their own frequency (no boost), so a phrase only beats a single +// word when it genuinely occurs more often. +// +// Caller owns the goroutine and the slot it's stored in; this function +// just does the work. Step 6 wires it to the 5-minute ticker. +// BuildCloud computes the top-K most frequent terms across all page +// bodies. Exposed for the digest.Manager ticker — the implementation +// lives on the Wiki because it reads `pages` directly; the supervisor +// owns the scheduling. +// +// The result mixes unigrams and bigrams: bigrams are scored by their +// own frequency (no boost), so a phrase only beats a single word when +// it genuinely occurs more often. +func (w *Wiki) BuildCloud(ctx context.Context, k int, stopwordsExtra []string) ([]CloudTerm, error) { + return w.buildCloud(ctx, k, stopwordsExtra) +} + +// SetCloud installs a freshly-built cloud into the in-memory cache. +// Pairs with BuildCloud; the supervisor calls Build → Set → Persist. +func (w *Wiki) SetCloud(terms []CloudTerm) { + w.cloud.Set(terms) +} + +// PersistCloud writes the current cloud cache to wiki_state. Called +// by the digest.Manager after a successful rebuild. 
+func (w *Wiki) PersistCloud(ctx context.Context) error { + return w.persistCloud(ctx) +} + +func (w *Wiki) buildCloud(ctx context.Context, k int, stopwordsExtra []string) ([]CloudTerm, error) { + if err := ctx.Err(); err != nil { + return nil, err + } + + rows, err := w.db.QueryContext(ctx, "SELECT body FROM pages") + if err != nil { + return nil, err + } + defer rows.Close() + + b := newCloudBuilder(stopwordsExtra) + unigrams := make(map[string]int) + bigrams := make(map[string]int) + + for rows.Next() { + if err := ctx.Err(); err != nil { + return nil, err + } + var body string + if err := rows.Scan(&body); err != nil { + continue + } + b.addPage(body, unigrams, bigrams) + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Merge the two count maps before selecting top-K. This lets a + // strong bigram outrank a weak unigram, and vice versa, on a + // single global scale. + merged := make(map[string]int, len(unigrams)+len(bigrams)) + for t, n := range unigrams { + merged[t] = n + } + for t, n := range bigrams { + merged[t] = n + } + return topK(merged, k), nil +} + +// cloudCache is a single-slot cache for the rebuilt cloud. The +// 5-minute ticker (Step 6) calls Set; readers (digest renderer) call +// Get. A read returns whatever was last set even if the ticker is +// behind — the digest's job is "good orientation," not "perfectly +// fresh stats." +type cloudCache struct { + mu sync.RWMutex + terms []CloudTerm + // set is true once Set has been called at least once. Readers + // distinguish "no cloud yet" (cold start) from "cloud is empty" + // (truly empty wiki) by checking set. + set bool + // version is bumped on each Set. The digest cache uses it as a + // change signal so it can invalidate rendered output without + // re-comparing slices. + version uint64 +} + +func (c *cloudCache) Set(terms []CloudTerm) { + c.mu.Lock() + defer c.mu.Unlock() + // Defensive copy: caller may continue to mutate the slice. 
+ cp := make([]CloudTerm, len(terms)) + copy(cp, terms) + c.terms = cp + c.set = true + c.version++ +} + +// Get returns a copy of the current cloud and whether one has been +// computed yet. +func (c *cloudCache) Get() ([]CloudTerm, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + if !c.set { + return nil, false + } + cp := make([]CloudTerm, len(c.terms)) + copy(cp, c.terms) + return cp, true +} + +// Version returns the monotonic change counter. Pairs with +// recentsLRU.version() for digest cache invalidation. +func (c *cloudCache) Version() uint64 { + c.mu.RLock() + defer c.mu.RUnlock() + return c.version +} diff --git a/internal/wiki/cloud_test.go b/internal/wiki/cloud_test.go new file mode 100644 index 0000000..208ccae --- /dev/null +++ b/internal/wiki/cloud_test.go @@ -0,0 +1,212 @@ +package wiki + +import ( + "context" + "reflect" + "strings" + "testing" +) + +func TestTokenize_Basic(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("Hello, world! This is mind-map.") + want := []string{"hello", "world", "this", "is", "mind-map"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("tokenize: got %v, want %v", got, want) + } +} + +func TestTokenize_KeepsHyphensAndUnderscores(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("page_count and mind-map are tokens") + if !contains(got, "page_count") { + t.Fatalf("expected page_count intact: %v", got) + } + if !contains(got, "mind-map") { + t.Fatalf("expected mind-map intact: %v", got) + } +} + +func TestTokenize_StripsWikilinkBrackets(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("see [[projects/mind-map]] for details") + // '/' is a separator, so we get the segments individually. 
+ if !contains(got, "projects") || !contains(got, "mind-map") { + t.Fatalf("wikilink target words missing: %v", got) + } + for _, tok := range got { + if strings.ContainsAny(tok, "[]") { + t.Fatalf("bracket leaked into token %q", tok) + } + } +} + +func TestTokenize_LowercasesUnicode(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("Привет МИР") + want := []string{"привет", "мир"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("tokenize unicode: got %v, want %v", got, want) + } +} + +func TestIsStopword(t *testing.T) { + b := newCloudBuilder([]string{"TODO"}) + cases := map[string]bool{ + "the": true, // default + "todo": true, // user-added, case-folded + "wiki": false, // domain term, not filtered + "a": true, // length<2 short-circuit (and in defaults) + "x": true, // length<2 + "42": true, // all-digit + "v1": false, // alnum mix, keep + "mind": false, + } + for tok, want := range cases { + if got := b.isStopword(tok); got != want { + t.Errorf("isStopword(%q) = %v, want %v", tok, got, want) + } + } +} + +func TestAddPage_UnigramAndBigramCounts(t *testing.T) { + b := newCloudBuilder(nil) + uni := map[string]int{} + bi := map[string]int{} + b.addPage("wiki engine. wiki engine.", uni, bi) + + if uni["wiki"] != 2 || uni["engine"] != 2 { + t.Fatalf("unigram counts wrong: %v", uni) + } + if bi["wiki engine"] != 2 { + t.Fatalf("bigram count wrong: %v", bi) + } + // "engine wiki" crosses a sentence boundary but our tokenizer + // treats '.' as a separator, not a sentence-aware split. The + // resulting bigram across "." is intentional — we don't have + // sentence info and a bigram across punctuation is still a + // real adjacent-token pair in the text. 
+ if bi["engine wiki"] != 1 { + t.Fatalf("expected one engine->wiki bigram: %v", bi) + } +} + +func TestAddPage_StopwordsFilterBothBigramEnds(t *testing.T) { + b := newCloudBuilder(nil) + uni := map[string]int{} + bi := map[string]int{} + // "the wiki" → unigram "wiki" counts (the is stopword), + // but bigram "the wiki" must NOT be recorded. + b.addPage("the wiki is here. the wiki is here.", uni, bi) + + if _, ok := bi["the wiki"]; ok { + t.Fatalf("stopword-led bigram leaked: %v", bi) + } + if _, ok := bi["wiki is"]; ok { + t.Fatalf("stopword-tailed bigram leaked: %v", bi) + } + if uni["wiki"] != 2 { + t.Fatalf("unigram counts off: %v", uni) + } +} + +func TestTopK_OrderingAndTieBreak(t *testing.T) { + counts := map[string]int{ + "banana": 5, + "apple": 5, + "cherry": 3, + "date": 1, + } + got := topK(counts, 3) + want := []CloudTerm{ + {Term: "apple", Count: 5}, + {Term: "banana", Count: 5}, + {Term: "cherry", Count: 3}, + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("topK: got %v, want %v", got, want) + } +} + +func TestTopK_Empty(t *testing.T) { + if got := topK(nil, 5); got != nil { + t.Fatalf("nil input should return nil, got %v", got) + } + if got := topK(map[string]int{"a": 1}, 0); got != nil { + t.Fatalf("k=0 should return nil, got %v", got) + } +} + +func TestBuildCloud_EndToEnd(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Seed extra content that should dominate the cloud. + if err := w.CreatePage(ctx, "topics/sqlite", + "# SQLite\n\nSQLite is a database. SQLite is fast. SQLite is small.\n"); err != nil { + t.Fatalf("seed: %v", err) + } + + terms, err := w.buildCloud(ctx, 10, nil) + if err != nil { + t.Fatalf("buildCloud: %v", err) + } + if len(terms) == 0 { + t.Fatalf("expected non-empty cloud") + } + + // "sqlite" should be the top unigram now (4+ occurrences across pages). 
+ found := false + for _, term := range terms { + if term.Term == "sqlite" { + found = true + if term.Count < 3 { + t.Errorf("sqlite count surprisingly low: %d", term.Count) + } + } + } + if !found { + t.Fatalf("sqlite missing from top-10: %v", terms) + } + + // No stopwords leaked. + for _, term := range terms { + for _, sw := range []string{"the", "is", "a", "and"} { + if term.Term == sw { + t.Errorf("stopword %q in cloud", sw) + } + } + } +} + +func TestCloudCache_RoundTrip(t *testing.T) { + c := &cloudCache{} + if got, ok := c.Get(); ok { + t.Fatalf("uninitialized cache should report not-set; got %v", got) + } + c.Set([]CloudTerm{{Term: "x", Count: 1}}) + got, ok := c.Get() + if !ok { + t.Fatalf("after Set, Get should report set") + } + if !reflect.DeepEqual(got, []CloudTerm{{Term: "x", Count: 1}}) { + t.Fatalf("roundtrip mismatch: %v", got) + } + + // Mutating the returned slice must not affect the cache. + got[0].Term = "MUTATED" + again, _ := c.Get() + if again[0].Term != "x" { + t.Fatalf("cache leaked internal state: %v", again) + } +} + +func contains(ss []string, s string) bool { + for _, x := range ss { + if x == s { + return true + } + } + return false +} diff --git a/internal/wiki/digest.go b/internal/wiki/digest.go new file mode 100644 index 0000000..468a98c --- /dev/null +++ b/internal/wiki/digest.go @@ -0,0 +1,309 @@ +package wiki + +import ( + "context" + "fmt" + "sort" + "strings" + "sync" +) + +// AreaSummary is one entry under `## Areas` in the rendered digest: +// a top-level directory, how many pages live under it, and (if the +// directory has an `index.md`) that index page's title as a one-line +// description. +type AreaSummary struct { + Path string `json:"path"` + PageCount int `json:"page_count"` + IndexTitle string `json:"index_title,omitempty"` +} + +// Digest is the structured form of the per-conversation orientation +// blob. 
The MCP `get_wiki_digest` tool and HTTP `/api/digest` endpoint +// return this — the markdown is what an LLM consumes; the typed fields +// let the WebUI render its own views (e.g. a word-cloud widget) without +// re-parsing the markdown. +type Digest struct { + PageCount int `json:"page_count"` + Cloud []CloudTerm `json:"cloud_terms"` + Recents []string `json:"recents"` + Areas []AreaSummary `json:"areas"` + Markdown string `json:"markdown"` +} + +// defaultMaxRenderBytes is the soft cap on the rendered markdown. +// Trim order when over: recents -> cloud -> areas (never). Matches +// the plan's ~4 KB target. Tunable via config (Step 7). +const defaultMaxRenderBytes = 4096 + +// digestCache is a single-slot cache for the rendered Digest, keyed +// by the (cloud version, recents seq) tuple at render time. The +// digest itself is a few-hundred-byte structure; what we're saving +// is the SQL roundtrip for area counts and the render loop, not the +// allocation. +type digestCache struct { + mu sync.Mutex + cloudVer uint64 + recentsSeq uint64 + pageCount int + cached *Digest +} + +// get returns the cached digest if (cloudVer, recentsSeq, pageCount) +// match the supplied values. pageCount is part of the key because a +// page added or removed without touching the LRU (rare, but happens +// on reindex for pure-content-change pages) still changes the header +// sentence ("This wiki contains N pages..."). +// +// Returns (nil, false) on a miss. 
+func (c *digestCache) get(cloudVer, recentsSeq uint64, pageCount int) (*Digest, bool) { + c.mu.Lock() + defer c.mu.Unlock() + if c.cached == nil { + return nil, false + } + if c.cloudVer != cloudVer || c.recentsSeq != recentsSeq || c.pageCount != pageCount { + return nil, false + } + return c.cached, true +} + +func (c *digestCache) set(cloudVer, recentsSeq uint64, pageCount int, d *Digest) { + c.mu.Lock() + defer c.mu.Unlock() + c.cloudVer = cloudVer + c.recentsSeq = recentsSeq + c.pageCount = pageCount + c.cached = d +} + +// invalidate clears the cache. Used in tests and on schema rebuilds +// (Step 4); CRUD doesn't need to call this because version bumps +// already cover the cache invalidation contract. +func (c *digestCache) invalidate() { + c.mu.Lock() + defer c.mu.Unlock() + c.cached = nil +} + +// Digest returns the current orientation digest. Cheap on cache hit; +// on miss, builds in O(pages) for the area counts and O(K) for the +// render. Safe for concurrent callers. +// +// This is the function HTTP `/api/digest` and the MCP `get_wiki_digest` +// tool call. It is also called transitively from the existing +// `get_wiki_context` (see Step 5) so old clients see the new data +// shape without breakage. 
+func (w *Wiki) Digest(ctx context.Context) (*Digest, error) { + if err := ctx.Err(); err != nil { + return nil, err + } + + pageCount, err := w.pageCount(ctx) + if err != nil { + return nil, fmt.Errorf("digest page count: %w", err) + } + + cloudVer := w.cloud.Version() + recentsSeq := w.recents.version() + + if d, ok := w.digest.get(cloudVer, recentsSeq, pageCount); ok { + return d, nil + } + + areas, err := w.areaSummaries(ctx) + if err != nil { + return nil, fmt.Errorf("digest areas: %w", err) + } + + cloudTerms, _ := w.cloud.Get() // ok == false → empty, render copes + recents := w.recents.snapshot() + + d := &Digest{ + PageCount: pageCount, + Cloud: cloudTerms, + Recents: recents, + Areas: areas, + } + d.Markdown = renderDigestMarkdown(d, w.renderCap()) + + w.digest.set(cloudVer, recentsSeq, pageCount, d) + return d, nil +} + +// renderCap returns the effective byte cap to pass into the markdown +// renderer. Normalized in Open() to: +// +// > 0 → trim to that size +// == 0 → defaulted, never observed here +// < 0 → no trimming +// +// The renderer treats <= 0 uniformly as "no trim," so we forward +// negative values straight through. +func (w *Wiki) renderCap() int { + return w.maxRenderBytes +} + +// pageCount runs the same SELECT COUNT(*) the Context handler uses. +// Lifted into a helper so Digest can share it. +func (w *Wiki) pageCount(ctx context.Context) (int, error) { + var n int + if err := w.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM pages").Scan(&n); err != nil { + return 0, err + } + return n, nil +} + +// areaSummaries returns one entry per top-level directory in the wiki, +// with the page count and (if the directory has `/index.md`) +// the title of that index page. Sorted by descending page count, then +// by name — same shape as the rendered markdown. +// +// An area with no pages under it cannot exist (the source data is the +// `pages` table; empty dirs aren't tracked). 
A flat-rooted page like +// "readme" with no slash is not an area; only paths containing `/` +// contribute. This matches what topLevelDirs() exposes via filesystem +// listing — the two should agree, but areaSummaries is the source of +// truth for the digest because it's driven by indexed content, not +// filesystem state. +func (w *Wiki) areaSummaries(ctx context.Context) ([]AreaSummary, error) { + rows, err := w.db.QueryContext(ctx, "SELECT path, title FROM pages") + if err != nil { + return nil, err + } + defer rows.Close() + + type acc struct { + count int + indexTitle string + } + bucket := map[string]*acc{} + + for rows.Next() { + var path, title string + if err := rows.Scan(&path, &title); err != nil { + continue + } + slash := strings.IndexByte(path, '/') + if slash < 0 { + continue // flat-rooted, not an area + } + area := path[:slash] + a, ok := bucket[area] + if !ok { + a = &acc{} + bucket[area] = a + } + a.count++ + // The area's index page is `/index`. Record its title + // once; if for some reason there are multiple (shouldn't be, + // PRIMARY KEY on path prevents it), the last one wins. + if path == area+"/index" { + a.indexTitle = title + } + } + if err := rows.Err(); err != nil { + return nil, err + } + + out := make([]AreaSummary, 0, len(bucket)) + for name, a := range bucket { + out = append(out, AreaSummary{ + Path: name, + PageCount: a.count, + IndexTitle: a.indexTitle, + }) + } + sort.Slice(out, func(i, j int) bool { + if out[i].PageCount != out[j].PageCount { + return out[i].PageCount > out[j].PageCount + } + return out[i].Path < out[j].Path + }) + return out, nil +} + +// renderDigestMarkdown produces the markdown blob shown to LLMs. The +// shape mirrors the example in the plan; ordering of sections is +// header -> cloud line -> Areas -> Recently active -> footer. +// +// When the assembled body exceeds maxBytes the renderer trims: +// 1. drop recents from the tail until under cap, then +// 2. 
drop cloud terms from the tail until under cap. +// +// Areas are never trimmed — they are the smallest section and the +// most structurally important: an agent that loses the area list +// loses the map of the wiki. The footer hint is also preserved. +// +// If maxBytes <= 0 no trimming is applied. Useful for tests that want +// to verify full content. +func renderDigestMarkdown(d *Digest, maxBytes int) string { + cloud := d.Cloud + recents := d.Recents + + for { + var sb strings.Builder + writeDigestBody(&sb, d.PageCount, cloud, d.Areas, recents) + out := sb.String() + if maxBytes <= 0 || len(out) <= maxBytes { + return out + } + // Trim recents first. + if len(recents) > 0 { + recents = recents[:len(recents)-1] + continue + } + // Then trim cloud. + if len(cloud) > 0 { + cloud = cloud[:len(cloud)-1] + continue + } + // Already minimal; return what we have, even if over cap. + // Areas + header alone exceeding 4 KB would require a + // wiki with hundreds of top-level dirs — unlikely, but + // truncating areas would be a worse failure mode. 
+ return out + } +} + +func writeDigestBody(sb *strings.Builder, pageCount int, cloud []CloudTerm, areas []AreaSummary, recents []string) { + areaCount := len(areas) + if areaCount == 1 { + fmt.Fprintf(sb, "This wiki contains %d pages across 1 area.", pageCount) + } else { + fmt.Fprintf(sb, "This wiki contains %d pages across %d areas.", pageCount, areaCount) + } + + if len(cloud) > 0 { + sb.WriteString(" About:\n") + for i, t := range cloud { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteString(t.Term) + } + sb.WriteString("\n") + } else { + sb.WriteString("\n") + } + + if len(areas) > 0 { + sb.WriteString("\n## Areas\n") + for _, a := range areas { + fmt.Fprintf(sb, "- %s (%d)", a.Path, a.PageCount) + if a.IndexTitle != "" { + fmt.Fprintf(sb, " — %s/index: %q", a.Path, a.IndexTitle) + } + sb.WriteString("\n") + } + } + + if len(recents) > 0 { + sb.WriteString("\n## Recently active\n") + for _, p := range recents { + fmt.Fprintf(sb, "- %s\n", p) + } + } + + sb.WriteString("\nFull skill: SKILL.md. Use `get_wiki_digest` for the live version.\n") +} diff --git a/internal/wiki/digest_test.go b/internal/wiki/digest_test.go new file mode 100644 index 0000000..1f2ab37 --- /dev/null +++ b/internal/wiki/digest_test.go @@ -0,0 +1,297 @@ +package wiki + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestDigest_StructuralFields(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + if d.PageCount == 0 { + t.Fatal("page count should be > 0") + } + if d.Markdown == "" { + t.Fatal("markdown should not be empty") + } + // testWiki creates pages under projects/ and people/ — at least + // two areas should surface. + if len(d.Areas) < 2 { + t.Fatalf("expected >= 2 areas, got %d: %v", len(d.Areas), d.Areas) + } + // Cloud is empty because the ticker hasn't run yet (cold start). 
+ // That's the expected behavior; the digest should still render. + if d.Cloud != nil && len(d.Cloud) != 0 { + t.Fatalf("cloud should be empty on cold start, got %v", d.Cloud) + } +} + +func TestDigest_MarkdownShape(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Seed cloud so we exercise the "About:" line too. + w.cloud.Set([]CloudTerm{ + {Term: "wiki", Count: 10}, + {Term: "mind-map", Count: 7}, + }) + // Seed recents. + w.recents.touch("projects/mind-map") + w.recents.touch("index") + // Bust the digest cache because we mutated state directly. + w.digest.invalidate() + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + md := d.Markdown + t.Logf("rendered:\n%s", md) + + mustContain := []string{ + "This wiki contains", + "About:", + "wiki, mind-map", + "## Areas", + "## Recently active", + "- index", + "- projects/mind-map", + "Full skill: SKILL.md", + "get_wiki_digest", + } + for _, s := range mustContain { + if !strings.Contains(md, s) { + t.Errorf("markdown missing %q\n---\n%s\n---", s, md) + } + } +} + +func TestDigest_AreaCountsAndIndexTitle(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Add an index page under "projects" with a known title. 
+ if err := w.CreatePage(ctx, "projects/index", `--- +title: Active Projects +--- +# Active Projects +`); err != nil { + t.Fatalf("create projects/index: %v", err) + } + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + var found *AreaSummary + for i := range d.Areas { + if d.Areas[i].Path == "projects" { + found = &d.Areas[i] + break + } + } + if found == nil { + t.Fatalf("projects area missing: %+v", d.Areas) + } + if found.IndexTitle != "Active Projects" { + t.Errorf("expected index title 'Active Projects', got %q", found.IndexTitle) + } + if found.PageCount < 2 { + t.Errorf("projects should have >=2 pages (mind-map + index), got %d", found.PageCount) + } + + // The rendered area line should include the index title quoted. + if !strings.Contains(d.Markdown, `projects/index: "Active Projects"`) { + t.Errorf("markdown missing index title:\n%s", d.Markdown) + } +} + +func TestDigest_CacheHit(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // First call populates the cache. + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first Digest: %v", err) + } + + // Second call with no state change returns the *same* pointer + // (the cache stores the *Digest; a hit returns it as-is). + second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second Digest: %v", err) + } + if first != second { + t.Errorf("expected cache hit to return same *Digest pointer") + } +} + +func TestDigest_CacheInvalidatedByLRUChange(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first: %v", err) + } + + // Touching the LRU bumps recents seq → cache miss next read. 
+ w.recents.touch("index") + + second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second: %v", err) + } + if first == second { + t.Errorf("expected fresh *Digest after recents change") + } + if !strings.Contains(second.Markdown, "- index") { + t.Errorf("new recents not reflected in markdown:\n%s", second.Markdown) + } +} + +func TestDigest_CacheInvalidatedByCloudChange(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first: %v", err) + } + + w.cloud.Set([]CloudTerm{{Term: "wiki", Count: 1}}) + + second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second: %v", err) + } + if first == second { + t.Errorf("expected fresh *Digest after cloud set") + } + if !strings.Contains(second.Markdown, "About:") { + t.Errorf("cloud not reflected in markdown:\n%s", second.Markdown) + } +} + +func TestRenderDigest_TrimToMaxBytes(t *testing.T) { + // Build a digest that's deliberately over-cap. + cloud := make([]CloudTerm, 50) + for i := range cloud { + cloud[i] = CloudTerm{Term: strings.Repeat("x", 20), Count: 1} + } + recents := make([]string, 50) + for i := range recents { + recents[i] = strings.Repeat("path", 20) + } + d := &Digest{ + PageCount: 100, + Cloud: cloud, + Recents: recents, + Areas: []AreaSummary{{Path: "a", PageCount: 5}}, + } + + const cap = 512 + md := renderDigestMarkdown(d, cap) + if len(md) > cap { + // The trimmer is best-effort: if the unavoidable parts + // (areas + header + footer) already exceed cap we accept + // being over. But in this test those are tiny, so we + // should be under. + t.Errorf("rendered len=%d > cap=%d", len(md), cap) + } + // Areas + header + footer must still be intact. 
+ mustContain := []string{"## Areas", "- a (5)", "Full skill"} + for _, s := range mustContain { + if !strings.Contains(md, s) { + t.Errorf("trim dropped required section %q:\n%s", s, md) + } + } +} + +func TestRenderDigest_NoCloudNoRecents(t *testing.T) { + d := &Digest{ + PageCount: 3, + Areas: []AreaSummary{ + {Path: "notes", PageCount: 3}, + }, + } + md := renderDigestMarkdown(d, 0) + if strings.Contains(md, "About:") { + t.Errorf("empty cloud should not produce About: line:\n%s", md) + } + if strings.Contains(md, "## Recently active") { + t.Errorf("empty recents should not produce section:\n%s", md) + } + if !strings.Contains(md, "## Areas") || !strings.Contains(md, "- notes (3)") { + t.Errorf("areas missing:\n%s", md) + } +} + +func TestAreaSummaries_FlatRootedPagesIgnored(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // `index` is flat-rooted; should not produce an "index" area. + areas, err := w.areaSummaries(ctx) + if err != nil { + t.Fatalf("areaSummaries: %v", err) + } + for _, a := range areas { + if a.Path == "index" { + t.Fatalf("flat-rooted page leaked into areas: %+v", areas) + } + } +} + +func TestReindex_RemovesFromLRU(t *testing.T) { + w, dir := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Touch and verify presence. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + found := false + for _, p := range w.recents.snapshot() { + if p == "index" { + found = true + } + } + if !found { + t.Fatalf("index should be in LRU after GetPage") + } + + // Raw-filesystem delete + reindex (simulating sync removing a file). 
+ if err := os.Remove(filepath.Join(dir, "index.md")); err != nil { + t.Fatalf("remove file: %v", err) + } + if _, err := w.Reindex(ctx); err != nil { + t.Fatalf("reindex: %v", err) + } + + for _, p := range w.recents.snapshot() { + if p == "index" { + t.Fatalf("reindex should have purged stale LRU entry: %v", w.recents.snapshot()) + } + } +} diff --git a/internal/wiki/index.go b/internal/wiki/index.go index e847f92..0379dc6 100644 --- a/internal/wiki/index.go +++ b/internal/wiki/index.go @@ -159,6 +159,13 @@ func (w *Wiki) Reindex(ctx context.Context) (ReindexStats, error) { slog.Warn("reindex remove error", slog.String("page", pagePath), slog.Any("error", err)) continue } + // Keep the recents LRU consistent with `pages`: a page + // that vanishes via raw-filesystem delete + reindex + // (common after `git pull` in sync) must drop from the + // LRU here, since DeletePage() was never called. Without + // this hook the digest's "recently active" can point at + // a 404. + w.recents.remove(pagePath) removed++ } } diff --git a/internal/wiki/pages.go b/internal/wiki/pages.go index 38291ec..d030cbd 100644 --- a/internal/wiki/pages.go +++ b/internal/wiki/pages.go @@ -50,6 +50,11 @@ func (w *Wiki) GetPage(ctx context.Context, pagePath string) (*Page, error) { slog.Warn("failed to get backlinks", slog.String("page", pagePath), slog.Any("error", err)) } + // LRU touch reflects that the agent actually saw this page. We + // only reach here on a successful row scan, so a typo'd path that + // hit the "page not found" branch above will not pollute recents. 
+ w.recents.touch(pagePath) + return &Page{ Path: pagePath, Title: title, @@ -148,7 +153,11 @@ func (w *Wiki) CreatePage(ctx context.Context, pagePath string, content string) } slog.Info("page created", slog.String("page", pagePath)) - return w.indexPage(ctx, pagePath) + if err := w.indexPage(ctx, pagePath); err != nil { + return err + } + w.recents.touch(pagePath) + return nil } // UpdatePage replaces the content of an existing page. @@ -178,7 +187,11 @@ func (w *Wiki) UpdatePage(ctx context.Context, pagePath string, content string) } slog.Info("page updated", slog.String("page", pagePath)) - return w.indexPage(ctx, pagePath) + if err := w.indexPage(ctx, pagePath); err != nil { + return err + } + w.recents.touch(pagePath) + return nil } // DeletePage removes a page from the filesystem and index. @@ -204,7 +217,13 @@ func (w *Wiki) DeletePage(ctx context.Context, pagePath string) error { } slog.Info("page deleted", slog.String("page", pagePath)) - return w.removePageIndex(ctx, pagePath) + if err := w.removePageIndex(ctx, pagePath); err != nil { + return err + } + // The page is gone; leaving it in recents would point the agent + // at a 404. Drop the entry rather than promote it. + w.recents.remove(pagePath) + return nil } // ErrDestinationExists is returned by MovePage when the destination @@ -310,6 +329,10 @@ func (w *Wiki) MovePage(ctx context.Context, fromPath, toPath string, opts MoveO return fmt.Errorf("index new page: %w", err) } + // Treat a move as one continuous "active use" rather than dropping + // the old name and freshly inserting the new one. See recentsLRU.rename. 
+ w.recents.rename(from, to) + slog.Info("page moved", slog.String("from", from), slog.String("to", to), @@ -359,7 +382,14 @@ func (w *Wiki) GetBacklinks(ctx context.Context, pagePath string) ([]string, err return nil, err } - return w.getBacklinks(ctx, pagePath) + backlinks, err := w.getBacklinks(ctx, pagePath) + if err != nil { + return nil, err + } + // GetBacklinks is "I'm looking at this page's incoming links" — + // an active use of the target page, even if its body wasn't read. + w.recents.touch(pagePath) + return backlinks, nil } // Link is a single source→target edge between two pages. @@ -393,7 +423,13 @@ func (w *Wiki) AllLinks(ctx context.Context) ([]Link, error) { return links, nil } -// Context returns a WikiContext overview. +// Context returns a WikiContext overview. The legacy fields +// (PageCount, RecentPages, TopLevelDirs) come from disk — recent_pages +// is mtime-sorted, top_level_dirs is read from the filesystem — and +// preserve the shape clients in the wild already depend on. The new +// fields (Cloud, Recents, Areas, Markdown) come from the digest so +// existing get_wiki_context callers get the orientation upgrade +// without switching tool names. func (w *Wiki) Context(ctx context.Context) (*WikiContext, error) { if err := ctx.Err(); err != nil { return nil, err @@ -430,11 +466,25 @@ func (w *Wiki) Context(ctx context.Context) (*WikiContext, error) { // Top-level dirs dirs := w.topLevelDirs() - return &WikiContext{ + wctx := &WikiContext{ PageCount: count, RecentPages: recent, TopLevelDirs: dirs, - }, nil + } + + // Layer the digest's signals on top. A failure here doesn't fail + // the whole Context() call — the legacy fields are still valuable + // on their own, and the digest is an enhancement, not a contract. 
+ if d, err := w.Digest(ctx); err == nil { + wctx.Cloud = d.Cloud + wctx.Recents = d.Recents + wctx.Areas = d.Areas + wctx.Markdown = d.Markdown + } else { + slog.Warn("context digest enrichment failed", slog.Any("error", err)) + } + + return wctx, nil } // --- locking --- diff --git a/internal/wiki/recents.go b/internal/wiki/recents.go new file mode 100644 index 0000000..2a45055 --- /dev/null +++ b/internal/wiki/recents.go @@ -0,0 +1,230 @@ +package wiki + +import ( + "container/list" + "sync" +) + +// recentsLRU is a fixed-capacity, most-recently-used-first ring of page +// paths. It tracks pages the user or agent has *actively* touched — +// Create, Update, Get, Move (new name only), GetBacklinks — rather +// than what disk mtime says was changed most recently. The distinction +// matters when sync's copyToWiki bumps mtimes for files the agent never +// looked at; an mtime-based "recents" would surface those, an LRU +// reflects intent. Delete and Reindex purges drop the entry instead. +// +// The structure is a doubly-linked list plus a path->element index, so +// touch / remove / rename are all O(1). It is safe for concurrent use. +// +// Persistence (snapshot to SQLite on a slow ticker) lives in state.go +// and the ticker lives in the wiki lifecycle code; recentsLRU itself +// is intentionally storage-agnostic. +type recentsLRU struct { + mu sync.Mutex + cap int + // ll holds paths with the most recently used at the front. + ll *list.List + // idx maps path -> list element for O(1) promote/remove. + idx map[string]*list.Element + // dirty is true when the in-memory state has diverged from the last + // persisted snapshot. The persistence ticker reads + clears it. + dirty bool + // seq is a monotonic counter bumped on every state-changing + // operation (touch / remove / rename / load). The digest cache + // uses it as a cheap "did anything change?" signal so it can + // invalidate rendered output without re-comparing snapshots. + // Wraps at uint64 max — irrelevant in practice.
+ seq uint64 +} + +// newRecentsLRU constructs an empty LRU with the given capacity. +// A non-positive cap is clamped to the default (20). +func newRecentsLRU(cap int) *recentsLRU { + if cap <= 0 { + cap = 20 + } + return &recentsLRU{ + cap: cap, + ll: list.New(), + idx: make(map[string]*list.Element), + } +} + +// touch records that the given page was just used. If the page is +// already in the ring it's promoted to the front; otherwise it's +// inserted at the front and the oldest entry is evicted if the ring +// is at capacity. +// +// Empty paths are ignored — callers don't need to guard the call site. +func (r *recentsLRU) touch(path string) { + if path == "" { + return + } + r.mu.Lock() + defer r.mu.Unlock() + + if elem, ok := r.idx[path]; ok { + r.ll.MoveToFront(elem) + r.dirty = true + r.seq++ + return + } + elem := r.ll.PushFront(path) + r.idx[path] = elem + if r.ll.Len() > r.cap { + oldest := r.ll.Back() + if oldest != nil { + r.ll.Remove(oldest) + delete(r.idx, oldest.Value.(string)) + } + } + r.dirty = true + r.seq++ +} + +// remove drops a path from the ring. Called when a page is deleted or +// purged by Reindex; the path is gone so keeping it would mislead the agent. +// No-op (no dirty/seq change) if the path is empty or isn't tracked. +func (r *recentsLRU) remove(path string) { + if path == "" { + return + } + r.mu.Lock() + defer r.mu.Unlock() + + elem, ok := r.idx[path] + if !ok { + return + } + r.ll.Remove(elem) + delete(r.idx, path) + r.dirty = true + r.seq++ +} + +// rename relabels an entry and promotes it to the front of the ring +// (it does not keep its old position). Called on MovePage so the move shows up as one touch (at the +// new name) rather than two (old name drops out, new name is fresh). +// +// If `from` isn't tracked, behaves as touch(to). If `to` is already +// tracked, the older `from` entry is removed and `to` is promoted. +// NOTE(review): unlike touch, an empty `to` is not filtered out when from != to.
+func (r *recentsLRU) rename(from, to string) { + if from == to { + r.touch(to) + return + } + r.mu.Lock() + defer r.mu.Unlock() + + fromElem, hasFrom := r.idx[from] + toElem, hasTo := r.idx[to] + + switch { + case hasFrom && hasTo: + // Both present: drop `from`, promote `to`. + r.ll.Remove(fromElem) + delete(r.idx, from) + r.ll.MoveToFront(toElem) + case hasFrom: + // Reuse the element under its new name, then promote it to the front. + fromElem.Value = to + delete(r.idx, from) + r.idx[to] = fromElem + r.ll.MoveToFront(fromElem) + case hasTo: + r.ll.MoveToFront(toElem) + default: + // Neither tracked: insert `to` fresh, evicting the oldest if over capacity. + elem := r.ll.PushFront(to) + r.idx[to] = elem + if r.ll.Len() > r.cap { + oldest := r.ll.Back() + if oldest != nil { + r.ll.Remove(oldest) + delete(r.idx, oldest.Value.(string)) + } + } + } + r.dirty = true + r.seq++ +} + +// snapshot returns the tracked paths, most recent first. The returned +// slice is a fresh copy owned by the caller and safe to mutate. +func (r *recentsLRU) snapshot() []string { + r.mu.Lock() + defer r.mu.Unlock() + + out := make([]string, 0, r.ll.Len()) + for e := r.ll.Front(); e != nil; e = e.Next() { + out = append(out, e.Value.(string)) + } + return out +} + +// load replaces the ring's contents with the given paths (treated as +// most-recent-first), skipping empty and duplicate paths and stopping at +// capacity. Used by the persistence layer on Wiki.Open to restore the +// last snapshot. Clears the dirty flag (loaded state counts as persisted). +func (r *recentsLRU) load(paths []string) { + r.mu.Lock() + defer r.mu.Unlock() + + r.ll.Init() + r.idx = make(map[string]*list.Element, len(paths)) + for _, p := range paths { + if p == "" { + continue + } + if _, dup := r.idx[p]; dup { + continue + } + elem := r.ll.PushBack(p) + r.idx[p] = elem + if r.ll.Len() >= r.cap { + break + } + } + r.dirty = false + r.seq++ +} + +// version returns the monotonic change counter.
The digest cache uses +// this as an invalidation signal: cache the rendered output keyed by +// (cloudVersion, recentsVersion), and rebuild when either advances. +// +// Cheap (one lock + load) so callers can invoke it on every read. +func (r *recentsLRU) version() uint64 { + r.mu.Lock() + defer r.mu.Unlock() + return r.seq +} + +// takeDirty returns whether the ring has unsaved changes and clears +// the flag in the same operation. The persistence ticker uses this to +// skip writes when nothing has changed. +func (r *recentsLRU) takeDirty() bool { + r.mu.Lock() + defer r.mu.Unlock() + was := r.dirty + r.dirty = false + return was +} + +// peekDirty returns whether the ring has unsaved changes without +// clearing the flag. Used by the digest.Manager's tick gate so the +// "did anything change?" probe doesn't race with the write that +// follows. +func (r *recentsLRU) peekDirty() bool { + r.mu.Lock() + defer r.mu.Unlock() + return r.dirty +} + +// len returns the number of tracked paths. Test helper. +func (r *recentsLRU) len() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.ll.Len() +} diff --git a/internal/wiki/recents_test.go b/internal/wiki/recents_test.go new file mode 100644 index 0000000..e016a9e --- /dev/null +++ b/internal/wiki/recents_test.go @@ -0,0 +1,230 @@ +package wiki + +import ( + "context" + "reflect" + "testing" +) + +func TestRecentsLRU_TouchAndOrder(t *testing.T) { + r := newRecentsLRU(3) + + r.touch("a") + r.touch("b") + r.touch("c") + if got, want := r.snapshot(), []string{"c", "b", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after initial touches: got %v, want %v", got, want) + } + + // Re-touching an existing entry promotes it. 
+ r.touch("a") + if got, want := r.snapshot(), []string{"a", "c", "b"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after promote: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_Eviction(t *testing.T) { + r := newRecentsLRU(2) + r.touch("a") + r.touch("b") + r.touch("c") // evicts "a" + + got := r.snapshot() + if len(got) != 2 || got[0] != "c" || got[1] != "b" { + t.Fatalf("expected [c b], got %v", got) + } +} + +func TestRecentsLRU_EmptyTouchIgnored(t *testing.T) { + r := newRecentsLRU(5) + r.touch("") + if r.len() != 0 { + t.Fatalf("empty path should not be tracked, len=%d", r.len()) + } +} + +func TestRecentsLRU_Remove(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") + + r.remove("b") + if got, want := r.snapshot(), []string{"c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after remove b: got %v, want %v", got, want) + } + + // Remove of missing path is a no-op. + r.remove("zzz") + if got, want := r.snapshot(), []string{"c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("noop remove changed state: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameInPlace(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") // order: c b a + + // Rename "b" -> "x": should land at the front (promoted) per the + // plan's "treat a move as active use of the new name" rule. + r.rename("b", "x") + if got, want := r.snapshot(), []string{"x", "c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename b->x: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameDestExists(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") // c b a + + // Rename "a" -> "c" (overwrite move): the old "a" entry should + // drop out, "c" should be promoted. 
+ r.rename("a", "c") + if got, want := r.snapshot(), []string{"c", "b"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename a->c (dest exists): got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameFromMissing(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + // Rename of an untracked source: equivalent to touching the dest. + r.rename("zzz", "b") + if got, want := r.snapshot(), []string{"b", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename zzz->b: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_LoadSnapshotRoundtrip(t *testing.T) { + r := newRecentsLRU(5) + r.load([]string{"a", "b", "c"}) + + if got, want := r.snapshot(), []string{"a", "b", "c"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after load: got %v, want %v", got, want) + } + if r.takeDirty() { + t.Fatalf("load should clear dirty flag") + } +} + +func TestRecentsLRU_LoadRespectsCapacity(t *testing.T) { + r := newRecentsLRU(2) + r.load([]string{"a", "b", "c", "d"}) + if r.len() != 2 { + t.Fatalf("load should stop at cap; len=%d", r.len()) + } +} + +func TestRecentsLRU_DirtyTracking(t *testing.T) { + r := newRecentsLRU(3) + if r.takeDirty() { + t.Fatalf("fresh ring should not be dirty") + } + r.touch("a") + if !r.takeDirty() { + t.Fatalf("touch should mark dirty") + } + if r.takeDirty() { + t.Fatalf("takeDirty should clear the flag") + } +} + +// --- integration: touches fire on the right Wiki ops --- + +func TestWiki_LRUIntegration(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // testWiki() seeded the wiki, and Reindex on Open() does not touch + // the LRU (indexing is plumbing, not "user used the page"). + if got := w.recents.snapshot(); len(got) != 0 { + t.Fatalf("LRU should be empty after Open; got %v", got) + } + + // GetPage touches. 
+ if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if got := w.recents.snapshot(); !reflect.DeepEqual(got, []string{"index"}) { + t.Fatalf("after GetPage: got %v", got) + } + + // Failed GetPage does NOT touch. + if _, err := w.GetPage(ctx, "does/not/exist"); err == nil { + t.Fatalf("expected error on missing page") + } + if got := w.recents.snapshot(); !reflect.DeepEqual(got, []string{"index"}) { + t.Fatalf("failed GetPage polluted LRU: %v", got) + } + + // CreatePage touches. + if err := w.CreatePage(ctx, "scratch", "# Scratch\n"); err != nil { + t.Fatalf("CreatePage: %v", err) + } + if got := w.recents.snapshot(); got[0] != "scratch" { + t.Fatalf("CreatePage should put scratch at front: %v", got) + } + + // UpdatePage touches. + if err := w.UpdatePage(ctx, "index", "# Welcome (updated)\n"); err != nil { + t.Fatalf("UpdatePage: %v", err) + } + if got := w.recents.snapshot(); got[0] != "index" { + t.Fatalf("UpdatePage should promote index: %v", got) + } + + // GetBacklinks touches. + if _, err := w.GetBacklinks(ctx, "projects/mind-map"); err != nil { + t.Fatalf("GetBacklinks: %v", err) + } + if got := w.recents.snapshot(); got[0] != "projects/mind-map" { + t.Fatalf("GetBacklinks should promote target: %v", got) + } + + // MovePage renames in the LRU. + if err := w.MovePage(ctx, "scratch", "notes/scratch", MoveOptions{}); err != nil { + t.Fatalf("MovePage: %v", err) + } + snap := w.recents.snapshot() + for _, p := range snap { + if p == "scratch" { + t.Fatalf("old name still in LRU after move: %v", snap) + } + } + if snap[0] != "notes/scratch" { + t.Fatalf("move dest should be at front: %v", snap) + } + + // DeletePage removes. 
+ if err := w.DeletePage(ctx, "notes/scratch"); err != nil { + t.Fatalf("DeletePage: %v", err) + } + for _, p := range w.recents.snapshot() { + if p == "notes/scratch" { + t.Fatalf("deleted page still in LRU: %v", w.recents.snapshot()) + } + } +} + +// CreatePage that fails (page already exists) must NOT touch. +func TestWiki_LRUNoTouchOnFailedCreate(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Drain the LRU to a known state. + w.recents.load(nil) + + // "index" already exists in testWiki. + if err := w.CreatePage(ctx, "index", "# dup\n"); err == nil { + t.Fatalf("expected CreatePage to fail on existing page") + } + if got := w.recents.snapshot(); len(got) != 0 { + t.Fatalf("failed CreatePage polluted LRU: %v", got) + } +} diff --git a/internal/wiki/state.go b/internal/wiki/state.go new file mode 100644 index 0000000..79bb030 --- /dev/null +++ b/internal/wiki/state.go @@ -0,0 +1,225 @@ +package wiki + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log/slog" + "time" +) + +// wiki_state schema: a small key/value table for cross-restart persistence +// of derived structures (recents LRU, word/phrase cloud). Distinct from +// the `pages` index, which is rebuildable from disk — wiki_state holds +// signals that *can't* be recovered from the markdown files alone: +// +// - "recent_lru" — the active-use ring (intent, not mtime). Lost on +// restart without persistence; that's exactly the case the digest +// plan is designed to avoid. +// - "cloud" — the word/phrase cloud is rebuildable but expensive +// (one full table scan + tokenization). Persisting it means a +// freshly-restarted server has a digest immediately, not after +// the first ticker tick (up to 5 minutes later). +// +// We intentionally do NOT persist the rendered digest markdown: it's +// sub-ms to re-assemble from cloud + LRU, and the in-memory +// digestCache already covers "don't re-format on every hit". 
+ +const ( + stateKeyRecentLRU = "recent_lru" + stateKeyCloud = "cloud" +) + +// initStateSchema creates the wiki_state table. Called from initSchema. +// Idempotent. +func (w *Wiki) initStateSchema() error { + _, err := w.db.Exec(` + CREATE TABLE IF NOT EXISTS wiki_state ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated TEXT NOT NULL + );`) + return err +} + +// recentsState is the on-disk shape of the persisted LRU. Stored as a +// JSON document under wiki_state["recent_lru"].value. Items are listed +// most-recent-first, matching recentsLRU.snapshot(). +type recentsState struct { + Items []string `json:"items"` +} + +// cloudState is the on-disk shape of the persisted cloud. +type cloudState struct { + Terms []CloudTerm `json:"terms"` +} + +// loadState pulls the persisted LRU + cloud out of wiki_state into +// memory. Called once at the end of Open(), after Reindex. Failures +// are logged but non-fatal — a missing or corrupt row just means the +// process starts with an empty signal, which is the same state a +// brand-new wiki ships with. +func (w *Wiki) loadState(ctx context.Context) { + if items, ok := w.readStateKey(ctx, stateKeyRecentLRU); ok { + var s recentsState + if err := json.Unmarshal([]byte(items), &s); err != nil { + slog.Warn("wiki_state recent_lru parse failed", slog.Any("error", err)) + } else { + // Filter against the current index so paths that vanished + // while the server was off (deleted, renamed via raw + // filesystem, or sync-pulled away) don't reappear in the + // LRU as 404 candidates. Reindex has already run by this + // point, so `pages` is the authoritative set. 
+ filtered := w.filterAgainstIndex(ctx, s.Items) + w.recents.load(filtered) + slog.Info("recents loaded from wiki_state", + slog.Int("persisted", len(s.Items)), + slog.Int("kept", len(filtered)), + ) + } + } + + if terms, ok := w.readStateKey(ctx, stateKeyCloud); ok { + var s cloudState + if err := json.Unmarshal([]byte(terms), &s); err != nil { + slog.Warn("wiki_state cloud parse failed", slog.Any("error", err)) + } else { + // Use the persisted cloud as-is. The cloud is global + // frequency counts, not per-page references — even if + // some pages have vanished the previous distribution + // is still a reasonable approximation until the next + // rebuild ticker fires (default: within 5 minutes of + // startup). + w.cloud.Set(s.Terms) + slog.Info("cloud loaded from wiki_state", slog.Int("terms", len(s.Terms))) + } + } +} + +// filterAgainstIndex returns only those paths that currently exist in +// the `pages` table, preserving input order. Used on Open() to drop +// stale persisted recents whose underlying pages vanished while the +// server was off. +// +// One query: SELECT path FROM pages (the whole index), probed via a map +// rather than a SQL IN-clause: (a) the input slice is small (~20 entries +// by default) and (b) building a variable-length IN-clause with placeholders +// for SQLite is awkward. NOTE(review): Scan errors and rows.Err() go unchecked, so a mid-scan failure drops valid paths.
+func (w *Wiki) filterAgainstIndex(ctx context.Context, paths []string) []string { + if len(paths) == 0 { + return nil + } + rows, err := w.db.QueryContext(ctx, "SELECT path FROM pages") + if err != nil { + slog.Warn("filterAgainstIndex query failed", slog.Any("error", err)) + return paths // fail open: keep all, let the next CRUD reconcile + } + defer rows.Close() + present := make(map[string]struct{}) + for rows.Next() { + var p string + if rows.Scan(&p) == nil { + present[p] = struct{}{} + } + } + out := make([]string, 0, len(paths)) + for _, p := range paths { + if _, ok := present[p]; ok { + out = append(out, p) + } + } + return out +} + +// readStateKey returns the value for a wiki_state key, or "", false if +// not present or the read failed. Read errors other than "no row" are +// logged so a real DB problem doesn't silently degrade the digest. +func (w *Wiki) readStateKey(ctx context.Context, key string) (string, bool) { + var value string + err := w.db.QueryRowContext(ctx, "SELECT value FROM wiki_state WHERE key = ?", key).Scan(&value) + if err == nil { + return value, true + } + // sql.ErrNoRows is the common case (first run on a wiki) — silent. + if errors.Is(err, sql.ErrNoRows) { + return "", false + } + slog.Warn("wiki_state read failed", slog.String("key", key), slog.Any("error", err)) + return "", false +} + +// writeStateKey upserts a wiki_state row. The (key, value, updated) +// triple is atomic via INSERT OR REPLACE — readers either see the old +// or the new value, never a torn write. +func (w *Wiki) writeStateKey(ctx context.Context, key, value string) error { + now := time.Now().UTC().Format(time.RFC3339Nano) + _, err := w.db.ExecContext(ctx, + "INSERT OR REPLACE INTO wiki_state (key, value, updated) VALUES (?, ?, ?)", + key, value, now, + ) + return err +} + +// persistRecents writes the current LRU snapshot to wiki_state. Called +// by the 30s persistence ticker (Step 6) and from Close() for a clean +// shutdown. 
Safe to call concurrently with reads — the LRU snapshot is +// taken under its own lock and the SQLite write is atomic. +// +// If the LRU's dirty flag is unset, this is still safe to call (we'll +// rewrite the same bytes); callers wanting to skip a redundant write +// should gate on peekDirty() before calling (takeDirty would clear the flag even if the write then fails). +func (w *Wiki) persistRecents(ctx context.Context) error { + state := recentsState{Items: w.recents.snapshot()} + data, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("marshal recents: %w", err) + } + return w.writeStateKey(ctx, stateKeyRecentLRU, string(data)) +} + +// persistCloud writes the current cloud cache to wiki_state. Called +// after a successful rebuild (Step 6). No-ops if the cloud has never +// been populated — there's nothing meaningful to write yet, and we +// don't want to clobber a previously-good persisted copy with an +// empty placeholder. +func (w *Wiki) persistCloud(ctx context.Context) error { + terms, ok := w.cloud.Get() + if !ok { + return nil + } + state := cloudState{Terms: terms} + data, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("marshal cloud: %w", err) + } + return w.writeStateKey(ctx, stateKeyCloud, string(data)) +} + +// PersistRecents is the exported entry point for the digest.Manager's +// 30-second flush ticker. The internal persistRecents helper is also +// called by Close() for a clean shutdown flush. +// +// PersistRecents clears the LRU's dirty flag on success: a follow-up +// RecentsDirty() will report false until the next touch. Callers that +// want to skip a redundant write should peek with RecentsDirty before +// calling this; PersistRecents itself always writes. +func (w *Wiki) PersistRecents(ctx context.Context) error { + if err := w.persistRecents(ctx); err != nil { + return err + } + // Clear dirty only after a successful write so a failed write is + // retried on the next tick. NOTE(review): a touch that lands between + // the snapshot above and this clear is flagged clean but not yet on disk.
+ w.recents.takeDirty() + return nil +} + +// RecentsDirty reports whether the LRU has unsaved changes since the +// last successful PersistRecents. Read-only — does not clear the flag. +// The digest.Manager uses this to skip redundant writes on an idle +// server. +func (w *Wiki) RecentsDirty() bool { + return w.recents.peekDirty() +} diff --git a/internal/wiki/state_test.go b/internal/wiki/state_test.go new file mode 100644 index 0000000..59e809c --- /dev/null +++ b/internal/wiki/state_test.go @@ -0,0 +1,171 @@ +package wiki + +import ( + "context" + "reflect" + "testing" +) + +func TestState_PersistAndLoadRecents(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Touch a few pages, then close the wiki — Close() flushes the LRU. + if _, err := w.GetPage(ctx, "projects/mind-map"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if _, err := w.GetPage(ctx, "people/alice"); err != nil { + t.Fatalf("GetPage: %v", err) + } + beforeClose := w.recents.snapshot() + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen the same wiki directory; the LRU should rehydrate. + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + got := w2.recents.snapshot() + if !reflect.DeepEqual(got, beforeClose) { + t.Fatalf("LRU not restored:\n before: %v\n after: %v", beforeClose, got) + } +} + +func TestState_PersistAndLoadCloud(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Seed and persist the cloud directly (the ticker isn't running + // in tests; Step 6 owns that wiring). 
+ terms := []CloudTerm{ + {Term: "wiki", Count: 5}, + {Term: "mind-map", Count: 3}, + } + w.cloud.Set(terms) + if err := w.persistCloud(ctx); err != nil { + t.Fatalf("persistCloud: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + loaded, ok := w2.cloud.Get() + if !ok { + t.Fatalf("cloud not restored (ok=false)") + } + if !reflect.DeepEqual(loaded, terms) { + t.Fatalf("cloud roundtrip mismatch:\n before: %v\n after: %v", terms, loaded) + } +} + +func TestState_LoadFiltersStalePaths(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Touch a real page and a fake one. We can't get a fake into the + // LRU via Wiki methods (they validate), so use the LRU directly. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + w.recents.touch("ghost/page/that/does/not/exist") + + if err := w.persistRecents(ctx); err != nil { + t.Fatalf("persistRecents: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen; the ghost path should be dropped on load because it + // isn't in `pages`. + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + for _, p := range w2.recents.snapshot() { + if p == "ghost/page/that/does/not/exist" { + t.Fatalf("stale path leaked through filter: %v", w2.recents.snapshot()) + } + } + // The real one survives. + found := false + for _, p := range w2.recents.snapshot() { + if p == "index" { + found = true + } + } + if !found { + t.Fatalf("real path dropped by filter: %v", w2.recents.snapshot()) + } +} + +func TestState_EmptyWikiNoErrors(t *testing.T) { + // A fresh wiki has no wiki_state rows. Open() must not error, + // and the LRU / cloud must be empty. 
+ dir := t.TempDir() + w, err := Open(dir) + if err != nil { + t.Fatalf("Open empty wiki: %v", err) + } + defer w.Close() + + if w.recents.len() != 0 { + t.Fatalf("expected empty LRU on fresh wiki, got %v", w.recents.snapshot()) + } + if _, ok := w.cloud.Get(); ok { + t.Fatalf("expected unpopulated cloud on fresh wiki") + } +} + +func TestState_CorruptRecentsRowFallsBack(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Inject a malformed JSON row directly. + if err := w.writeStateKey(ctx, stateKeyRecentLRU, "{not valid json"); err != nil { + t.Fatalf("writeStateKey: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen must not error; LRU should be empty (load failed silently). + // Close flushes the (just-emptied) LRU, so the corrupt row gets + // overwritten by a valid one on shutdown — that's also fine. + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen with corrupt row: %v", err) + } + defer w2.Close() + + if w2.recents.len() != 0 { + t.Fatalf("expected empty LRU after corrupt row; got %v", w2.recents.snapshot()) + } +} + +func TestState_PersistCloudNoOpWhenUnset(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // cloud has never been Set on this wiki; persisting must not + // write a placeholder (would clobber a previously-good copy). 
+ if err := w.persistCloud(ctx); err != nil { + t.Fatalf("persistCloud unset: %v", err) + } + if _, ok := w.readStateKey(ctx, stateKeyCloud); ok { + t.Fatalf("expected no wiki_state[cloud] row when cloud is unset") + } +} diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index 84b4778..2c668ac 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -13,6 +13,7 @@ import ( "log/slog" "os" "path/filepath" + "sync" "time" _ "modernc.org/sqlite" // pure-Go SQLite driver (no CGO required) @@ -43,23 +44,130 @@ type SearchResult struct { Snippet string `json:"snippet"` } -// WikiContext provides an overview of the wiki for orientation. +// WikiContext provides an overview of the wiki for orientation. The +// legacy fields (PageCount, RecentPages, TopLevelDirs) reflect disk +// state — recent_pages is sorted by file mtime, top_level_dirs is read +// from the filesystem — and remain available for clients that already +// depend on that shape (opencode, Claude Code in the wild, per the +// plan's open question #4). +// +// The newer fields (Cloud, Recents, Areas, Markdown) are the digest +// signals: cloud terms across all page bodies, the active-use LRU +// (intent, not mtime), per-area page counts pulled from the index, +// and the rendered markdown an LLM can use directly. New clients +// should prefer `get_wiki_digest` for these, but `get_wiki_context` +// returns them too so existing tool wiring still benefits from the +// orientation upgrade without a client change. type WikiContext struct { PageCount int `json:"page_count"` RecentPages []Page `json:"recent_pages"` TopLevelDirs []string `json:"top_level_dirs"` + + // Cloud is the top-K word/phrase cloud across all page bodies. + // Empty until the first ticker fires on a freshly-opened wiki. + Cloud []CloudTerm `json:"cloud_terms,omitempty"` + // Recents is the active-use LRU — paths the user/agent actually + // touched (Create/Update/Get/Move/GetBacklinks). 
Distinct from + // RecentPages which is mtime-based. + Recents []string `json:"recents,omitempty"` + // Areas is the per-top-level-directory page count + index title. + // Driven by the indexed `pages` table, not filesystem listing. + Areas []AreaSummary `json:"areas,omitempty"` + // Markdown is the rendered digest blob — the same string an LLM + // would consume from `get_wiki_digest`. Included here so the + // existing get_wiki_context call gives clients an upgrade path + // without a tool-name change. + Markdown string `json:"markdown,omitempty"` +} + +// Options tunes Wiki construction. All fields are optional; the zero +// value gives the built-in defaults (recents capacity 20, render cap +// 4 KB, no extra stopwords). Pass with WithOptions to Open(): +// +// w, err := wiki.Open(dir, wiki.WithOptions(wiki.Options{ +// RecentsSize: 50, +// MaxRenderBytes: 8192, +// })) +// +// Options is value-passed; mutating it after Open has no effect. +type Options struct { + // RecentsSize is the active-use LRU capacity. Default 20; values <= 0 select the default. + RecentsSize int + // MaxRenderBytes caps the rendered digest markdown. Default 4096 (also used for 0); negative disables trimming. + MaxRenderBytes int + // StopwordsExtra is forwarded to the cloud builder when invoked + // directly via BuildCloud. The digest.Manager passes its own + // copy through Options on its Manager; this field is here so + // non-Manager callers (tests, ad-hoc tools) get the same set. + StopwordsExtra []string +} + +// OpenOption configures wiki.Open. Use WithOptions or future targeted +// helpers; the variadic form keeps Open(dir) source-compatible. +type OpenOption func(*Options) + +// WithOptions sets the entire Options struct in one call, overwriting anything +// set by earlier options. The common embedder path: read config, build Options, pass to Open. +func WithOptions(opts Options) OpenOption { + return func(o *Options) { *o = opts } } // Wiki is the core engine. Create one with Open().
type Wiki struct { - root string // absolute path to wiki directory - db *sql.DB // SQLite database with FTS5 - sessionID string // unique ID for this process, used for page locks + root string // absolute path to wiki directory + db *sql.DB // SQLite database with FTS5 + sessionID string // unique ID for this process, used for page locks + // recents tracks pages the user/agent has actively touched. See + // recents.go for the rationale (intent vs. disk mtime). Persistence + // to SQLite is layered on in state.go; here it just lives in memory. + recents *recentsLRU + // cloud holds the most recent word/phrase cloud rebuild. Populated + // by the 5-minute ticker (Step 6); cold start renders without it. + cloud *cloudCache + // digest caches the rendered markdown blob, invalidated by cloud + // version + recents seq changes. See digest.go. + digest *digestCache + // maxRenderBytes is the soft cap applied by Digest(); Open maps 0 to + // the default, so only a negative value disables trimming (tests). + maxRenderBytes int + // stopwordsExtra is forwarded to buildCloud when called directly + // without an explicit extras list. + stopwordsExtra []string + // closeOnce guards Close() against double-invocation: testWiki and + // other callers commonly stack defer Close on top of t.Cleanup, and a + // second run would hit persistRecents on an already-closed DB and log + // a spurious warning. closeErr holds the first Close's db.Close result. + closeOnce sync.Once + closeErr error } // Open opens (or creates) a wiki rooted at the given directory. // It initializes the SQLite index and performs an initial scan. -func Open(root string) (*Wiki, error) { +// Pass OpenOption values (typically a single WithOptions) to tune the +// digest signals; the default options match the digest plan's +// recommended values (LRU=20, render cap=4096, no extra stopwords). 
+func Open(root string, opts ...OpenOption) (*Wiki, error) { + o := Options{ + RecentsSize: 20, + MaxRenderBytes: defaultMaxRenderBytes, + } + for _, fn := range opts { + fn(&o) + } + if o.RecentsSize <= 0 { + o.RecentsSize = 20 + } + // MaxRenderBytes semantics: + // > 0 → trim to that many bytes + // == 0 → fall back to default (4096) — most likely an + // uninitialized Options struct + // < 0 → no trimming (tests / power users) + // The field is normalized to those three states here so digest + // rendering can just check the sign without re-deriving intent. + if o.MaxRenderBytes == 0 { + o.MaxRenderBytes = defaultMaxRenderBytes + } + absRoot, err := filepath.Abs(root) if err != nil { return nil, fmt.Errorf("resolve wiki root: %w", err) @@ -79,7 +187,16 @@ func Open(root string) (*Wiki, error) { } sessionID := fmt.Sprintf("pid-%d-%d", os.Getpid(), time.Now().UnixNano()) - w := &Wiki{root: absRoot, db: db, sessionID: sessionID} + w := &Wiki{ + root: absRoot, + db: db, + sessionID: sessionID, + recents: newRecentsLRU(o.RecentsSize), + cloud: &cloudCache{}, + digest: &digestCache{}, + maxRenderBytes: o.MaxRenderBytes, + stopwordsExtra: o.StopwordsExtra, + } if err := w.initSchema(); err != nil { db.Close() return nil, fmt.Errorf("init schema: %w", err) @@ -97,15 +214,35 @@ func Open(root string) (*Wiki, error) { return nil, fmt.Errorf("initial index: %w", err) } + // Load persisted derived state (recents LRU, word cloud) after + // reindex so any stale entries pointing at pages that vanished + // while the server was off get filtered against the fresh index. + // Failures are logged but non-fatal — a corrupt state row just + // degrades to "fresh-wiki" behavior, not a crash. + w.loadState(context.Background()) + slog.Info("wiki opened", slog.String("root", absRoot)) return w, nil } // Close releases page locks held by this session and closes the database. +// Idempotent — safe to call multiple times (e.g. 
when a test stacks +// defer Close on top of testWiki's t.Cleanup). func (w *Wiki) Close() error { - slog.Info("wiki closing", slog.String("root", w.root)) - w.db.Exec("DELETE FROM page_locks WHERE holder = ?", w.sessionID) - return w.db.Close() + w.closeOnce.Do(func() { + slog.Info("wiki closing", slog.String("root", w.root)) + // Flush the LRU one last time so a clean shutdown doesn't + // lose the last ~30 seconds of touches between ticker fires. + // Errors are logged, not propagated — we'd rather close + // cleanly with a slightly stale snapshot than leak the DB + // handle. + if err := w.persistRecents(context.Background()); err != nil { + slog.Warn("recents flush on close failed", slog.Any("error", err)) + } + w.db.Exec("DELETE FROM page_locks WHERE holder = ?", w.sessionID) + w.closeErr = w.db.Close() + }) + return w.closeErr } // Root returns the wiki's root directory. @@ -165,6 +302,10 @@ func (w *Wiki) initSchema() error { return err } + if err := w.initStateSchema(); err != nil { + return fmt.Errorf("wiki_state schema: %w", err) + } + // Clean up stale locks (older than 5 minutes) from crashed processes _, err := w.db.Exec("DELETE FROM page_locks WHERE acquired < ?", time.Now().Add(-5*time.Minute).UTC().Format(time.RFC3339)) diff --git a/webui/src/App.tsx b/webui/src/App.tsx index 4d6c126..e293d34 100644 --- a/webui/src/App.tsx +++ b/webui/src/App.tsx @@ -4,6 +4,7 @@ import { Logo } from './Logo'; import { PageBrowser } from './PageBrowser'; import { GraphView } from './GraphView'; import { searchTokens, searchRegex, Highlighted } from './search'; +import { TagInput } from './TagInput'; import { marked } from 'marked'; import mermaid from 'mermaid'; @@ -16,8 +17,22 @@ interface SyncSettings { mappings?: { prefix: string; remote: string }[]; } +// DigestSettings mirrors internal/config.DigestConfig. 
All fields are +// optional on the wire — a config file without a digest section +// loads with zero values, which the consumers interpret as "use the +// built-in defaults". The UI surfaces the same contract: empty +// numeric inputs and an empty tag list keep server defaults intact. +interface DigestSettings { + cloud_size?: number; + recents_size?: number; + cloud_refresh?: string; + stopwords_extra?: string[]; + max_render_bytes?: number; +} + interface Settings { sync: SyncSettings; + digest?: DigestSettings; } async function loadSettings(): Promise { @@ -339,6 +354,21 @@ export function App() { setSettingsSaved(false); }; + // updateDigest is the per-field mutator for the Digest section. + // It accepts the field's actual value type (number for numeric + // knobs, string for cloud_refresh, string[] for stopwords). The + // server omits the digest section entirely when it's never been + // set, so we lazily materialize an empty object on first touch. + const updateDigest = (field: K, value: DigestSettings[K]) => { + if (!settings) return; + setSettings({ + ...settings, + digest: { ...(settings.digest ?? {}), [field]: value }, + }); + setSettingsDirty(true); + setSettingsSaved(false); + }; + const renderMarkdown = (body: string): string => { // Convert [[wikilinks]] to clickable links before rendering const withLinks = body.replace(/\[\[([^\]|]+)(?:\|([^\]]+))?\]\]/g, (_, target, display) => { @@ -562,6 +592,81 @@ export function App() { )} +
+
Digest
+
+ The per-conversation orientation digest summarizes what this wiki is about. A background job rebuilds the word/phrase cloud on a schedule; the recents LRU updates on every page op. All fields are optional — leave blank to use defaults. +
+ +
+ +
+ Domain-specific noise to exclude from the cloud (e.g. TODO, FIXME, see, also). Comma, space, or Enter to add a tag; Backspace on empty input removes the last one. +
+ updateDigest('stopwords_extra', next)} + placeholder="Type a word and press space, comma, or Enter" + /> +
+ +
+ +
Top-K terms in the word/phrase cloud. Default 50.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('cloud_size', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="50" + /> +
+ +
+ +
Active-use LRU capacity. Default 20. Applied on next restart.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('recents_size', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="20" + /> +
+ +
+ +
How often the cloud rebuilds (e.g. 5m, 10m). Floor: 30s. Default 5m.
+ updateDigest('cloud_refresh', (e.target as HTMLInputElement).value || undefined)} + placeholder="5m" + /> +
+ +
+ +
Soft cap on the rendered markdown blob. Default 4096 (~1K tokens). Leave blank or set to 0 for the default; a negative value disables trimming.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('max_render_bytes', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="4096" + /> +
+
+
Index
diff --git a/webui/src/TagInput.tsx b/webui/src/TagInput.tsx new file mode 100644 index 0000000..96048e0 --- /dev/null +++ b/webui/src/TagInput.tsx @@ -0,0 +1,130 @@ +import { useState, useRef } from 'preact/hooks'; + +interface TagInputProps { + value: string[]; + onChange: (next: string[]) => void; + placeholder?: string; + // Maximum number of tags. When reached, further input is blocked + // until the user removes a tag. Omitted = no limit. + maxTags?: number; +} + +// TagInput is a controlled "chips + textbox" control: type a word, +// hit comma, space, or Enter, and it becomes a tag. Backspace on an +// empty input deletes the previous tag (standard chip-input UX — +// matches Gmail's To: line, GitHub's labels, etc.). Pasting a +// comma- or whitespace-separated string creates multiple tags in +// one shot. +// +// Values are de-duplicated case-insensitively but preserved in the +// case the user typed — we don't want to fold "JWT" into "jwt" on +// the way back to the server. The consumer of the values (the cloud +// builder) is the one that case-folds for matching; storing the +// user's intent verbatim respects what they typed. +export function TagInput({ value, onChange, placeholder, maxTags }: TagInputProps) { + const [draft, setDraft] = useState(''); + const inputRef = useRef(null); + + const commit = (raw: string) => { + // Split on commas and whitespace so pasting a list works + // even if the user pasted "TODO, FIXME see" (mixed + // separators). Empty fragments are filtered out by trim. 
+ const fragments = raw + .split(/[\s,]+/) + .map(s => s.trim()) + .filter(Boolean); + if (fragments.length === 0) return; + + const lowerExisting = new Set(value.map(v => v.toLowerCase())); + const additions: string[] = []; + for (const f of fragments) { + if (lowerExisting.has(f.toLowerCase())) continue; + if (maxTags && value.length + additions.length >= maxTags) break; + lowerExisting.add(f.toLowerCase()); + additions.push(f); + } + if (additions.length > 0) onChange([...value, ...additions]); + setDraft(''); + }; + + const removeAt = (idx: number) => { + const next = value.slice(); + next.splice(idx, 1); + onChange(next); + // Keep focus on the input so the user can keep editing. + inputRef.current?.focus(); + }; + + const onKeyDown = (e: KeyboardEvent) => { + // Commit triggers: Enter, comma, space. Comma and space need + // to be intercepted so they don't actually land in the input. + if (e.key === 'Enter' || e.key === ',' || e.key === ' ') { + // Don't commit on a leading space inside an in-progress + // word — user might be pasting and the paste handler + // will fire separately. Specifically: only commit when + // there's something to commit. + if (draft.trim() !== '') { + e.preventDefault(); + commit(draft); + } else if (e.key === ',' || e.key === ' ') { + // Swallow stray separators on an empty input so the + // box doesn't fill with whitespace. + e.preventDefault(); + } + return; + } + if (e.key === 'Backspace' && draft === '' && value.length > 0) { + e.preventDefault(); + removeAt(value.length - 1); + } + }; + + const onPaste = (e: ClipboardEvent) => { + const pasted = e.clipboardData?.getData('text') ?? ''; + if (/[\s,]/.test(pasted)) { + // The paste contains separators — handle the whole + // string as tags rather than letting it land in the + // input field where the user would have to manually + // split it. + e.preventDefault(); + commit(pasted); + } + }; + + return ( +
inputRef.current?.focus()}> + {value.map((tag, idx) => ( + + {tag} + + + ))} + setDraft((e.target as HTMLInputElement).value)} + onKeyDown={onKeyDown} + onPaste={onPaste} + onBlur={() => { + // Commit any in-progress draft on blur so the user + // doesn't have to remember the keyboard ritual when + // they tab away or click Save. + if (draft.trim() !== '') commit(draft); + }} + /> +
+ ); +} diff --git a/webui/src/styles.css b/webui/src/styles.css index 97e54ce..a9abb7e 100644 --- a/webui/src/styles.css +++ b/webui/src/styles.css @@ -817,3 +817,74 @@ mark { @media (prefers-color-scheme: dark) { .settings-reindex-error { color: #ff8080; } } + +/* Tag input (Digest > Extra Stopwords). + * Looks and behaves like a single .settings-field input: same border, + * same focus accent, same width cap. The internal chips wrap and the + * input field stretches to fill the remaining row. */ +.tag-input { + width: 100%; + max-width: 480px; + min-height: 36px; + padding: 4px 6px; + border: 1px solid var(--border); + background: var(--bg); + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 4px; + cursor: text; +} + +.tag-input:focus-within { + border-color: var(--accent); +} + +.tag { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 2px 4px 2px 8px; + background: var(--accent); + color: white; + font-family: var(--font-mono); + font-size: 12px; + line-height: 1.4; + border-radius: 2px; + user-select: none; +} + +.tag-label { + /* Allow long tags to wrap or truncate gracefully if someone + * pastes a paragraph by mistake. Word-break here keeps the tag + * pill compact in the row. */ + overflow-wrap: anywhere; + max-width: 200px; +} + +.tag-remove { + background: transparent; + border: none; + color: white; + cursor: pointer; + padding: 0 4px; + font-size: 14px; + line-height: 1; + opacity: 0.8; +} + +.tag-remove:hover { + opacity: 1; +} + +.tag-input-field { + flex: 1; + min-width: 120px; + border: none; + outline: none; + background: transparent; + color: var(--fg); + font-family: var(--font-mono); + font-size: 13px; + padding: 4px 2px; +}