77from django .db import transaction
88
99from httpx import Client as httpxClient
10+ from loguru import logger
1011from pgvector .django import L2Distance
1112
1213from baserow_enterprise .assistant .models import (
@@ -238,7 +239,7 @@ def load_categories(self, categories_serialized: Iterable[Tuple[str, str | None]
238239 categories_with_parents , ["parent_id" ]
239240 )
240241
241- def sync_knowledge_base (self ):
242+ def sync_knowledge_base_from_csv (self ):
242243 """
243244 Sync entries from `website_export.csv` with the knowledgebase documents and
244245 chunks. The idea is that this `website_export.csv` file can easily be
@@ -249,9 +250,6 @@ def sync_knowledge_base(self):
249250 removed from the source, it will also be removed in the documents.
250251 """
251252
252- # Ensure default categories exist (parents set by load_categories)
253- self .load_categories (DEFAULT_CATEGORIES )
254-
255253 csv_path = self ._csv_path ()
256254 with csv_path .open ("r" , encoding = "utf-8" , newline = "" ) as f :
257255 reader = csv .DictReader (f )
@@ -430,16 +428,185 @@ def sync_knowledge_base(self):
430428 if not chunks :
431429 return
432430
433- embeddings = self .vector_handler .embed_texts (texts )
434- if KnowledgeBaseChunk .can_search_vectors ():
435- for c , e in zip (chunks , embeddings ):
436- c .embedding = list (e )
437- c ._embedding_array = list (e )
438- else :
439- for c , e in zip (chunks , embeddings ):
440- c ._embedding_array = list (e )
431+ self ._update_chunks (texts , chunks )
432+
def sync_knowledge_base_from_dev_docs(self):
    """
    Sync the developer documentation from the local `docs/` folder with the
    knowledgebase documents and chunks. Every `.md` file is included; the
    title, slug and source URL are derived from the file path. Entries are
    created, updated or deleted so the database mirrors the folder, and the
    chunks/embeddings are rebuilt only for documents whose body changed.
    """

    docs_root = self._get_docs_path()
    if docs_root is None:
        # Bug fix: the previous message interpolated `docs_root`, which is
        # always None on this branch, logging "The None folder ...".
        logger.warning(
            "The docs folder does not exist, skip synchronizing the dev docs"
        )
        return

    doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS

    pages = self._collect_dev_doc_pages(docs_root)
    slugs = set(pages.keys())

    # NOTE(review): assumes a category named "dev_docs" exists; when it
    # doesn't, documents are created without a category — confirm intended.
    dev_docs_category = KnowledgeBaseCategory.objects.filter(
        name="dev_docs"
    ).first()

    with transaction.atomic():
        existing = {
            d.slug: d for d in KnowledgeBaseDocument.objects.filter(type=doc_type)
        }

        # Delete docs that no longer have a corresponding markdown file. This
        # is needed because a file could be removed because it's no longer
        # relevant. It should then not show up in the docs anymore.
        to_delete_slugs = [s for s in existing.keys() if s not in slugs]
        if to_delete_slugs:
            KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=to_delete_slugs
            ).delete()
            for s in to_delete_slugs:
                existing.pop(s, None)

        create, update = [], []
        doc_ids_needing_chunks: set[int] = set()

        for slug, page in pages.items():
            doc = existing.get(slug)
            if doc:
                changed = False
                body_changed = False
                if doc.title != page["title"]:
                    doc.title = page["title"]
                    changed = True
                if doc.raw_content != page["body"]:
                    doc.raw_content = page["body"]
                    changed = True
                    body_changed = True
                if doc.content != page["body"]:
                    doc.content = page["body"]
                    changed = True
                    body_changed = True
                if dev_docs_category and doc.category_id != dev_docs_category.id:
                    doc.category = dev_docs_category
                    changed = True
                if doc.process_document:
                    doc.process_document = False
                    changed = True
                if doc.status != KnowledgeBaseDocument.Status.READY:
                    doc.status = KnowledgeBaseDocument.Status.READY
                    changed = True
                if doc.source_url != page["source_url"]:
                    doc.source_url = page["source_url"]
                    changed = True

                if changed:
                    update.append(doc)
                    # Only a body change invalidates the stored chunks.
                    if body_changed:
                        doc_ids_needing_chunks.add(doc.id)
            else:
                create.append(
                    KnowledgeBaseDocument(
                        title=page["title"],
                        slug=slug,
                        type=doc_type,
                        raw_content=page["body"],
                        process_document=False,
                        content=page["body"],
                        status=KnowledgeBaseDocument.Status.READY,
                        category=dev_docs_category,
                        source_url=page["source_url"],
                    )
                )

        if create:
            KnowledgeBaseDocument.objects.bulk_create(create)
            # Re-fetch so we have primary keys for the chunk rebuild below.
            fresh = KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=[d.slug for d in create]
            )
            for doc in fresh:
                existing[doc.slug] = doc
                doc_ids_needing_chunks.add(doc.id)

        if update:
            # The `updated_on` field is not saved during the bulk update, so
            # we need to pre_save this value manually. The field lookup is
            # hoisted out of the loop since it is loop-invariant.
            updated_on_field = KnowledgeBaseDocument._meta.get_field("updated_on")
            for doc in update:
                doc.updated_on = updated_on_field.pre_save(doc, add=False)

            KnowledgeBaseDocument.objects.bulk_update(
                update,
                [
                    "title",
                    "raw_content",
                    "process_document",
                    "content",
                    "status",
                    "category",
                    "source_url",
                    "updated_on",
                ],
            )

        # If there are no chunks to rebuild, we can skip the final part
        # because there is no need to delete and recreate the missing chunks.
        if not doc_ids_needing_chunks:
            return

        KnowledgeBaseChunk.objects.filter(
            source_document_id__in=list(doc_ids_needing_chunks)
        ).delete()

        chunks, texts = [], []
        for slug, doc in existing.items():
            if doc.id not in doc_ids_needing_chunks:
                continue
            body = pages[slug]["body"]
            chunks.append(
                KnowledgeBaseChunk(
                    source_document=doc, index=0, content=body, metadata={}
                )
            )
            texts.append(body)

        if not chunks:
            return

        self._update_chunks(texts, chunks)

def _collect_dev_doc_pages(self, docs_root):
    """
    Scan `docs_root` recursively for markdown files and return a mapping of
    slug -> {"title", "body", "source_url"} describing each doc page.
    """

    pages: dict[str, dict] = {}
    for md_path in docs_root.rglob("*.md"):
        rel_str = md_path.relative_to(docs_root).as_posix()
        # `rglob("*.md")` can match ".MD" on case-insensitive filesystems;
        # keep the guard so the suffix strip below is always valid.
        if not rel_str.lower().endswith(".md"):
            continue

        rel_without_md = rel_str[:-3]
        slug = f"dev/{rel_without_md}"

        # Derive a human-readable title from the file name, splitting on
        # dashes/underscores, e.g. "ci-cd" -> "Ci Cd".
        stem = md_path.stem
        parts = [p for p in stem.replace("_", "-").split("-") if p]
        if parts:
            title = " ".join(p[:1].upper() + p[1:].lower() for p in parts)
        else:
            title = stem[:1].upper() + stem[1:].lower()

        pages[slug] = {
            "title": title,
            "body": md_path.read_text(encoding="utf-8"),
            "source_url": f"https://baserow.io/docs/{rel_without_md}",
        }
    return pages
443610
444611 def _csv_path (self ):
445612 path = Path (__file__ ).resolve ().parents [5 ] / "website_export.csv"
@@ -449,6 +616,17 @@ def _csv_path(self):
449616
450617 return path
451618
def _get_docs_path(self) -> Path | None:
    """
    Return the path to the `docs` directory if it exists, otherwise None.
    The folder is expected at `../../../../../../../docs` from this file.
    """

    # `Path.is_dir()` is False for missing paths, so one check suffices.
    candidate = Path(__file__).resolve().parents[7] / "docs"
    return candidate if candidate.is_dir() else None
629+
452630 def _csv_type_to_enum (self , csv_value : str | None ) -> str :
453631 v = (csv_value or "" ).strip ()
454632 if not v :
@@ -457,3 +635,22 @@ def _csv_type_to_enum(self, csv_value: str | None) -> str:
457635 if v .lower () == dt .value .lower ():
458636 return dt .value
459637 return KnowledgeBaseDocument .DocumentType .RAW_DOCUMENT
638+
def _update_chunks(self, texts, chunks):
    """
    Embed `texts`, attach the resulting vectors to the matching `chunks`
    (pairwise, in order), and persist the chunks in bulk.
    """

    embeddings = self.vector_handler.embed_texts(texts)
    vectors_supported = KnowledgeBaseChunk.can_search_vectors()
    for chunk, embedding in zip(chunks, embeddings):
        chunk._embedding_array = list(embedding)
        # The `embedding` column is only populated when vector search is
        # available; a fresh list is stored in each attribute.
        if vectors_supported:
            chunk.embedding = list(embedding)

    KnowledgeBaseChunk.objects.bulk_create(chunks)
650+
def sync_knowledge_base(self):
    """
    Run the full knowledge base synchronization: ensure the default
    categories exist (parents are set by `load_categories`), then sync the
    CSV export followed by the developer documentation.
    """

    self.load_categories(DEFAULT_CATEGORIES)
    self.sync_knowledge_base_from_csv()
    self.sync_knowledge_base_from_dev_docs()
0 commit comments