diff --git a/changelog/entries/unreleased/feature/sync_dev_docs_with_knowledge_base_documents.json b/changelog/entries/unreleased/feature/sync_dev_docs_with_knowledge_base_documents.json new file mode 100644 index 0000000000..cfe951a429 --- /dev/null +++ b/changelog/entries/unreleased/feature/sync_dev_docs_with_knowledge_base_documents.json @@ -0,0 +1,9 @@ +{ + "type": "feature", + "message": "Synchronizes the dev docs with the assistant knowledge base documents.", + "issue_origin": "github", + "issue_number": null, + "domain": "core", + "bullet_points": [], + "created_at": "2025-11-24" +} diff --git a/deploy/helm/baserow/README.md b/deploy/helm/baserow/README.md index 198d42f860..a6d3880dd3 100644 --- a/deploy/helm/baserow/README.md +++ b/deploy/helm/baserow/README.md @@ -10,7 +10,7 @@ To install the chart with the release name `my-baserow` run the following comman From repo ```bash -helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart +helm repo add baserow-chart https://baserow.github.io/baserow-chart helm install my-baserow baserow-chart/baserow --namespace baserow --create-namespace --values config.yaml ``` diff --git a/docs/installation/install-with-helm.md b/docs/installation/install-with-helm.md index be464e1f78..706f09bdc5 100644 --- a/docs/installation/install-with-helm.md +++ b/docs/installation/install-with-helm.md @@ -28,7 +28,7 @@ Before installing Baserow with Helm, ensure you have: First, add the Baserow Helm chart repository: ```bash -helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart +helm repo add baserow-chart https://baserow.github.io/baserow-chart helm repo update ``` @@ -325,7 +325,7 @@ Add the Baserow Helm repository and install: ```bash # Add Baserow chart repository -helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart +helm repo add baserow-chart https://baserow.github.io/baserow-chart helm repo update # Install Baserow diff --git 
a/enterprise/backend/src/baserow_enterprise/assistant/models.py b/enterprise/backend/src/baserow_enterprise/assistant/models.py index 13beff1f0d..9c48402a96 100644 --- a/enterprise/backend/src/baserow_enterprise/assistant/models.py +++ b/enterprise/backend/src/baserow_enterprise/assistant/models.py @@ -191,6 +191,8 @@ class DocumentCategory(NamedTuple): DocumentCategory("enterprise", "billing"), # FAQ DocumentCategory("faq", None), + # Dev Docs + DocumentCategory("dev_docs", None), ] @@ -264,6 +266,10 @@ class DocumentType(models.TextChoices): """ Documents downloaded from `baserow.io/user-docs`, our online Knowledge Base. """ + BASEROW_DEV_DOCS = "baserow_dev_docs", "Baserow Dev Docs" + """ + Documents downloaded from `baserow.io/docs`, the dev docs. + """ FAQ = "faq", "FAQ" """ Frequently Asked Question. It could be a single question or multiple ones for diff --git a/enterprise/backend/src/baserow_enterprise/assistant/tools/search_user_docs/handler.py b/enterprise/backend/src/baserow_enterprise/assistant/tools/search_user_docs/handler.py index b554c57af6..6ad53bd1db 100644 --- a/enterprise/backend/src/baserow_enterprise/assistant/tools/search_user_docs/handler.py +++ b/enterprise/backend/src/baserow_enterprise/assistant/tools/search_user_docs/handler.py @@ -7,6 +7,7 @@ from django.db import transaction from httpx import Client as httpxClient +from loguru import logger from pgvector.django import L2Distance from baserow_enterprise.assistant.models import ( @@ -238,7 +239,7 @@ def load_categories(self, categories_serialized: Iterable[Tuple[str, str | None] categories_with_parents, ["parent_id"] ) - def sync_knowledge_base(self): + def sync_knowledge_base_from_csv(self): """ Sync entries from `website_export.csv` with the knowledgebase documents and chunks. The idea is that this `website_export.csv` file can easily be @@ -249,9 +250,6 @@ def sync_knowledge_base(self): removed from the source, it will also be removed in the documents. 
""" - # Ensure default categories exist (parents set by load_categories) - self.load_categories(DEFAULT_CATEGORIES) - csv_path = self._csv_path() with csv_path.open("r", encoding="utf-8", newline="") as f: reader = csv.DictReader(f) @@ -430,16 +428,185 @@ def sync_knowledge_base(self): if not chunks: return - embeddings = self.vector_handler.embed_texts(texts) - if KnowledgeBaseChunk.can_search_vectors(): - for c, e in zip(chunks, embeddings): - c.embedding = list(e) - c._embedding_array = list(e) - else: - for c, e in zip(chunks, embeddings): - c._embedding_array = list(e) + self._update_chunks(texts, chunks) + + def sync_knowledge_base_from_dev_docs(self): + """ + Sync the developer documentation from the local `docs/` folder with the + knowledgebase documents and chunks. Every .md file will be included. It will + automatically figure out a title, slug, etc. It automatically checks if the + entry already exists, and will create, update or delete accordingly. + """ + + docs_root = self._get_docs_path() + if docs_root is None: + logger.warning( + "The `docs` folder does not exist, skip synchronizing the dev " + "docs" + ) + return + + doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS + + pages: dict[str, dict] = {} + slugs: set[str] = set() + + for md_path in docs_root.rglob("*.md"): + rel = md_path.relative_to(docs_root) + rel_str = rel.as_posix() + if not rel_str.lower().endswith(".md"): + continue + + rel_without_md = rel_str[:-3] + slug = f"dev/{rel_without_md}" + slugs.add(slug) + + stem = md_path.stem # e.g. 
"ci-cd" + stem_normalized = stem.replace("_", "-") + parts = [p for p in stem_normalized.split("-") if p] + title = ( + " ".join(p[:1].upper() + p[1:].lower() for p in parts) + if parts + else stem[:1].upper() + stem[1:].lower() + ) + + with md_path.open("r", encoding="utf-8") as f: + body = f.read() - KnowledgeBaseChunk.objects.bulk_create(chunks) + source_url = f"https://baserow.io/docs/{rel_without_md}" + + pages[slug] = { + "title": title, + "body": body, + "source_url": source_url, + } + + dev_docs_category = KnowledgeBaseCategory.objects.filter( + name="dev_docs" + ).first() + + with transaction.atomic(): + existing = { + d.slug: d for d in KnowledgeBaseDocument.objects.filter(type=doc_type) + } + + # Delete docs that no longer have a corresponding markdown file. This is + # needed because a file could be removed because it's no longer relevant. + # It should then not show up in the docs anymore. + to_delete_slugs = [s for s in existing.keys() if s not in slugs] + if to_delete_slugs: + KnowledgeBaseDocument.objects.filter( + type=doc_type, slug__in=to_delete_slugs + ).delete() + for s in to_delete_slugs: + existing.pop(s, None) + + create, update = [], [] + doc_ids_needing_chunks: set[int] = set() + + for slug, p in pages.items(): + d = existing.get(slug) + if d: + changed = False + body_changed = False + if d.title != p["title"]: + d.title = p["title"] + changed = True + if d.raw_content != p["body"]: + d.raw_content = p["body"] + changed = True + body_changed = True + if d.content != p["body"]: + d.content = p["body"] + changed = True + body_changed = True + if dev_docs_category and d.category_id != dev_docs_category.id: + d.category = dev_docs_category + changed = True + if d.process_document: + d.process_document = False + changed = True + if d.status != KnowledgeBaseDocument.Status.READY: + d.status = KnowledgeBaseDocument.Status.READY + changed = True + if d.source_url != p["source_url"]: + d.source_url = p["source_url"] + changed = True + + if changed: 
+ update.append(d) + if body_changed: + doc_ids_needing_chunks.add(d.id) + else: + new_doc = KnowledgeBaseDocument( + title=p["title"], + slug=slug, + type=doc_type, + raw_content=p["body"], + process_document=False, + content=p["body"], + status=KnowledgeBaseDocument.Status.READY, + category=dev_docs_category, + source_url=p["source_url"], + ) + create.append(new_doc) + + if create: + KnowledgeBaseDocument.objects.bulk_create(create) + fresh = KnowledgeBaseDocument.objects.filter( + type=doc_type, slug__in=[d.slug for d in create] + ) + for d in fresh: + existing[d.slug] = d + doc_ids_needing_chunks.add(d.id) + + if update: + # The `updated_on` field is not saved during the bulk update, so we + # would need to pre_save this value before. + for d in update: + d.updated_on = KnowledgeBaseDocument._meta.get_field( + "updated_on" + ).pre_save(d, add=False) + + KnowledgeBaseDocument.objects.bulk_update( + update, + [ + "title", + "raw_content", + "process_document", + "content", + "status", + "category", + "source_url", + "updated_on", + ], + ) + + # If there are no chunks to rebuild, we can skip the final part because + # there is no need to delete and recreate the missing chunks. + if not doc_ids_needing_chunks: + return + + KnowledgeBaseChunk.objects.filter( + source_document_id__in=list(doc_ids_needing_chunks) + ).delete() + + chunks, texts = [], [] + for slug, d in existing.items(): + if d.id not in doc_ids_needing_chunks: + continue + body = pages[slug]["body"] + chunks.append( + KnowledgeBaseChunk( + source_document=d, index=0, content=body, metadata={} + ) + ) + texts.append(body) + + if not chunks: + return + + self._update_chunks(texts, chunks) def _csv_path(self): path = Path(__file__).resolve().parents[5] / "website_export.csv" @@ -449,6 +616,17 @@ def _csv_path(self): return path + def _get_docs_path(self) -> Path | None: + """ + Returns the path to the `docs` directory if it exists, otherwise None. 
+ The folder is expected at `../../../../../../../docs` from this handler file. + """ + + path = Path(__file__).resolve().parents[7] / "docs" + if not path.exists() or not path.is_dir(): + return None + return path + def _csv_type_to_enum(self, csv_value: str | None) -> str: v = (csv_value or "").strip() if not v: @@ -457,3 +635,22 @@ def _csv_type_to_enum(self, csv_value: str | None) -> str: if v.lower() == dt.value.lower(): return dt.value return KnowledgeBaseDocument.DocumentType.RAW_DOCUMENT + + def _update_chunks(self, texts, chunks): + embeddings = self.vector_handler.embed_texts(texts) + if KnowledgeBaseChunk.can_search_vectors(): + for c, e in zip(chunks, embeddings): + c.embedding = list(e) + c._embedding_array = list(e) + else: + for c, e in zip(chunks, embeddings): + c._embedding_array = list(e) + + KnowledgeBaseChunk.objects.bulk_create(chunks) + + def sync_knowledge_base(self): + # Ensure default categories exist (parents set by load_categories) + self.load_categories(DEFAULT_CATEGORIES) + + self.sync_knowledge_base_from_csv() + self.sync_knowledge_base_from_dev_docs() diff --git a/enterprise/backend/src/baserow_enterprise/migrations/0056_alter_knowledgebasedocument_type.py b/enterprise/backend/src/baserow_enterprise/migrations/0056_alter_knowledgebasedocument_type.py new file mode 100644 index 0000000000..44075bb0f8 --- /dev/null +++ b/enterprise/backend/src/baserow_enterprise/migrations/0056_alter_knowledgebasedocument_type.py @@ -0,0 +1,27 @@ +# Generated by Django 5.0.14 on 2025-11-24 14:10 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("baserow_enterprise", "0055_assistantchatmessage_action_group_id_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="knowledgebasedocument", + name="type", + field=models.CharField( + choices=[ + ("raw_document", "Raw Document"), + ("baserow_user_docs", "Baserow User Docs"), + ("baserow_dev_docs", "Baserow Dev Docs"), + ("faq", 
"FAQ"), + ("template", "Template"), + ], + default="raw_document", + max_length=20, + ), + ), + ] diff --git a/enterprise/backend/tests/baserow_enterprise_tests/assistant/test_sync_knowledge_base.py b/enterprise/backend/tests/baserow_enterprise_tests/assistant/test_sync_knowledge_base.py index e89edee9fc..51942379f0 100644 --- a/enterprise/backend/tests/baserow_enterprise_tests/assistant/test_sync_knowledge_base.py +++ b/enterprise/backend/tests/baserow_enterprise_tests/assistant/test_sync_knowledge_base.py @@ -23,6 +23,17 @@ def handler_and_csv(tmp_path, monkeypatch): return handler, csv_path +@pytest.fixture +def handler_and_docs_root(tmp_path, monkeypatch): + docs_root = tmp_path / "docs" + docs_root.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr(KnowledgeBaseHandler, "_get_docs_path", lambda self: docs_root) + + handler = KnowledgeBaseHandler() + return handler, docs_root + + def write_csv(path: Path, rows: list[dict]): headers = [ "id", @@ -352,3 +363,145 @@ def test_sync_knowledge_base_with_real_file(monkeypatch): assert count_documents == KnowledgeBaseDocument.objects.all().count() assert count_chunks == KnowledgeBaseChunk.objects.all().count() + + +@pytest.mark.django_db +def test_sync_dev_docs_creates_documents_and_chunks(handler_and_docs_root, monkeypatch): + handler, docs_root = handler_and_docs_root + + handler.load_categories(DEFAULT_CATEGORIES) + assert KnowledgeBaseCategory.objects.filter(name="dev_docs").exists() + + dev_dir = docs_root / "development" + api_dir = dev_dir / "api" + dev_dir.mkdir() + api_dir.mkdir() + + file1 = dev_dir / "ci-cd.md" + file1.write_text("# CI/CD guide", encoding="utf-8") + + file2 = api_dir / "this-is-a-name.md" + file2.write_text("API doc body", encoding="utf-8") + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS + + d1 = 
KnowledgeBaseDocument.objects.get(type=doc_type, slug="dev/development/ci-cd") + d2 = KnowledgeBaseDocument.objects.get( + type=doc_type, slug="dev/development/api/this-is-a-name" + ) + + assert d1.title == "Ci Cd" + assert d2.title == "This Is A Name" + + assert d1.category.name == "dev_docs" + assert d2.category.name == "dev_docs" + + assert d1.source_url == "https://baserow.io/docs/development/ci-cd" + assert d2.source_url == "https://baserow.io/docs/development/api/this-is-a-name" + + assert KnowledgeBaseChunk.objects.filter(source_document=d1).count() == 1 + assert KnowledgeBaseChunk.objects.filter(source_document=d2).count() == 1 + + +@pytest.mark.django_db +def test_sync_dev_docs_no_reembedding_when_body_unchanged( + handler_and_docs_root, monkeypatch +): + handler, docs_root = handler_and_docs_root + + handler.load_categories(DEFAULT_CATEGORIES) + + dev_dir = docs_root / "development" + dev_dir.mkdir() + + doc_file = dev_dir / "ci-cd.md" + doc_file.write_text("Initial body", encoding="utf-8") + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS + doc = KnowledgeBaseDocument.objects.get(type=doc_type, slug="dev/development/ci-cd") + chunk_before = KnowledgeBaseChunk.objects.get(source_document=doc) + chunk_before_id = chunk_before.id + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + chunk_after = KnowledgeBaseChunk.objects.get(source_document=doc) + assert chunk_after.id == chunk_before_id + + +@pytest.mark.django_db +def test_sync_dev_docs_reembeds_on_body_change(handler_and_docs_root, monkeypatch): + handler, docs_root = handler_and_docs_root + + handler.load_categories(DEFAULT_CATEGORIES) + + dev_dir = docs_root / "development" + dev_dir.mkdir() + + doc_file = dev_dir / "ci-cd.md" + doc_file.write_text("Original body", encoding="utf-8") 
+ + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS + doc = KnowledgeBaseDocument.objects.get(type=doc_type, slug="dev/development/ci-cd") + old_chunk = KnowledgeBaseChunk.objects.get(source_document=doc) + old_chunk_id = old_chunk.id + assert "Original body" in old_chunk.content + + doc_file.write_text("Updated body text", encoding="utf-8") + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + new_chunk = KnowledgeBaseChunk.objects.get(source_document=doc) + assert new_chunk.id != old_chunk_id + assert "Updated body text" in new_chunk.content + + +@pytest.mark.django_db +def test_sync_dev_docs_deletes_docs_when_file_removed( + handler_and_docs_root, monkeypatch +): + handler, docs_root = handler_and_docs_root + + handler.load_categories(DEFAULT_CATEGORIES) + + dev_dir = docs_root / "development" + dev_dir.mkdir() + + file1 = dev_dir / "ci-cd.md" + file2 = dev_dir / "other-page.md" + file1.write_text("A", encoding="utf-8") + file2.write_text("B", encoding="utf-8") + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS + assert KnowledgeBaseDocument.objects.filter( + type=doc_type, slug="dev/development/ci-cd" + ).exists() + assert KnowledgeBaseDocument.objects.filter( + type=doc_type, slug="dev/development/other-page" + ).exists() + + file1.unlink() + + monkeypatch.setattr(handler.vector_handler, "embed_texts", fake_embed_texts) + handler.sync_knowledge_base_from_dev_docs() + + # Document for removed file should be deleted; other remains + assert not KnowledgeBaseDocument.objects.filter( + type=doc_type, slug="dev/development/ci-cd" + ).exists() + assert KnowledgeBaseDocument.objects.filter( + type=doc_type, 
slug="dev/development/other-page" + ).exists()