@@ -0,0 +1,9 @@
{
"type": "feature",
"message": "Synchronizes the dev docs with the assistant knowledge base documents.",
"issue_origin": "github",
"issue_number": null,
"domain": "core",
"bullet_points": [],
"created_at": "2025-11-24"
}
2 changes: 1 addition & 1 deletion deploy/helm/baserow/README.md
@@ -10,7 +10,7 @@ To install the chart with the release name `my-baserow` run the following comman

From repo
```bash
-helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart
+helm repo add baserow-chart https://baserow.github.io/baserow-chart
helm install my-baserow baserow-chart/baserow --namespace baserow --create-namespace --values config.yaml
```

4 changes: 2 additions & 2 deletions docs/installation/install-with-helm.md
@@ -28,7 +28,7 @@ Before installing Baserow with Helm, ensure you have:
First, add the Baserow Helm chart repository:

```bash
-helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart
+helm repo add baserow-chart https://baserow.github.io/baserow-chart
helm repo update
```

@@ -325,7 +325,7 @@ Add the Baserow Helm repository and install:

```bash
# Add Baserow chart repository
-helm repo add baserow-chart https://baserow.gitlab.io/baserow-chart
+helm repo add baserow-chart https://baserow.github.io/baserow-chart
helm repo update

# Install Baserow
6 changes: 6 additions & 0 deletions enterprise/backend/src/baserow_enterprise/assistant/models.py
@@ -191,6 +191,8 @@ class DocumentCategory(NamedTuple):
DocumentCategory("enterprise", "billing"),
# FAQ
DocumentCategory("faq", None),
# Dev Docs
DocumentCategory("dev_docs", None),
]


@@ -264,6 +266,10 @@ class DocumentType(models.TextChoices):
"""
Documents downloaded from `baserow.io/user-docs`, our online Knowledge Base.
"""
BASEROW_DEV_DOCS = "baserow_dev_docs", "Baserow Dev Docs"
"""
Developer documentation synced from the local `docs/` folder, published at `baserow.io/docs`.
"""
FAQ = "faq", "FAQ"
"""
Frequently Asked Question. It could be a single question or multiple ones for
@@ -7,6 +7,7 @@
from django.db import transaction

from httpx import Client as httpxClient
from loguru import logger
from pgvector.django import L2Distance

from baserow_enterprise.assistant.models import (
@@ -238,7 +239,7 @@ def load_categories(self, categories_serialized: Iterable[Tuple[str, str | None]
categories_with_parents, ["parent_id"]
)

-def sync_knowledge_base(self):
+def sync_knowledge_base_from_csv(self):
"""
Sync entries from `website_export.csv` with the knowledgebase documents and
chunks. The idea is that this `website_export.csv` file can easily be
@@ -249,9 +250,6 @@ def sync_knowledge_base(self):
removed from the source, it will also be removed in the documents.
"""

-# Ensure default categories exist (parents set by load_categories)
-self.load_categories(DEFAULT_CATEGORIES)

csv_path = self._csv_path()
with csv_path.open("r", encoding="utf-8", newline="") as f:
reader = csv.DictReader(f)
@@ -430,16 +428,185 @@ def sync_knowledge_base(self):
if not chunks:
return

-embeddings = self.vector_handler.embed_texts(texts)
-if KnowledgeBaseChunk.can_search_vectors():
-for c, e in zip(chunks, embeddings):
-c.embedding = list(e)
-c._embedding_array = list(e)
-else:
-for c, e in zip(chunks, embeddings):
-c._embedding_array = list(e)
+self._update_chunks(texts, chunks)

def sync_knowledge_base_from_dev_docs(self):
"""
Sync the developer documentation from the local `docs/` folder with the
knowledgebase documents and chunks. Every .md file is included, and a title,
slug and source URL are derived automatically from the file path. Existing
entries are updated, new ones are created, and entries whose file has been
removed are deleted.
"""

docs_root = self._get_docs_path()
if docs_root is None:
logger.warning(
"The docs folder does not exist, skipping synchronization of the dev "
"docs"
)
return

doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS

pages: dict[str, dict] = {}
slugs: set[str] = set()

for md_path in docs_root.rglob("*.md"):
rel = md_path.relative_to(docs_root)
rel_str = rel.as_posix()
if not rel_str.lower().endswith(".md"):
continue

rel_without_md = rel_str[:-3]
slug = f"dev/{rel_without_md}"
slugs.add(slug)

stem = md_path.stem # e.g. "ci-cd"
stem_normalized = stem.replace("_", "-")
parts = [p for p in stem_normalized.split("-") if p]
title = (
" ".join(p[:1].upper() + p[1:].lower() for p in parts)
if parts
else stem[:1].upper() + stem[1:].lower()
)

with md_path.open("r", encoding="utf-8") as f:
body = f.read()

-KnowledgeBaseChunk.objects.bulk_create(chunks)
source_url = f"https://baserow.io/docs/{rel_without_md}"

pages[slug] = {
"title": title,
"body": body,
"source_url": source_url,
}

dev_docs_category = KnowledgeBaseCategory.objects.filter(
name="dev_docs"
).first()

with transaction.atomic():
existing = {
d.slug: d for d in KnowledgeBaseDocument.objects.filter(type=doc_type)
}

# Delete documents that no longer have a corresponding markdown file. A file
# can be removed when it's no longer relevant, and it should then disappear
# from the knowledge base as well.
to_delete_slugs = [s for s in existing.keys() if s not in slugs]
if to_delete_slugs:
KnowledgeBaseDocument.objects.filter(
type=doc_type, slug__in=to_delete_slugs
).delete()
for s in to_delete_slugs:
existing.pop(s, None)

create, update = [], []
doc_ids_needing_chunks: set[int] = set()

for slug, p in pages.items():
d = existing.get(slug)
if d:
changed = False
body_changed = False
if d.title != p["title"]:
d.title = p["title"]
changed = True
if d.raw_content != p["body"]:
d.raw_content = p["body"]
changed = True
body_changed = True
if d.content != p["body"]:
d.content = p["body"]
changed = True
body_changed = True
if dev_docs_category and d.category_id != dev_docs_category.id:
d.category = dev_docs_category
changed = True
if d.process_document:
d.process_document = False
changed = True
if d.status != KnowledgeBaseDocument.Status.READY:
d.status = KnowledgeBaseDocument.Status.READY
changed = True
if d.source_url != p["source_url"]:
d.source_url = p["source_url"]
changed = True

if changed:
update.append(d)
if body_changed:
doc_ids_needing_chunks.add(d.id)
else:
new_doc = KnowledgeBaseDocument(
title=p["title"],
slug=slug,
type=doc_type,
raw_content=p["body"],
process_document=False,
content=p["body"],
status=KnowledgeBaseDocument.Status.READY,
category=dev_docs_category,
source_url=p["source_url"],
)
create.append(new_doc)

if create:
KnowledgeBaseDocument.objects.bulk_create(create)
fresh = KnowledgeBaseDocument.objects.filter(
type=doc_type, slug__in=[d.slug for d in create]
)
for d in fresh:
existing[d.slug] = d
doc_ids_needing_chunks.add(d.id)

if update:
# `bulk_update` does not touch the `updated_on` field, so its value has to
# be computed with `pre_save` before updating.
for d in update:
d.updated_on = KnowledgeBaseDocument._meta.get_field(
"updated_on"
).pre_save(d, add=False)

KnowledgeBaseDocument.objects.bulk_update(
update,
[
"title",
"raw_content",
"process_document",
"content",
"status",
"category",
"source_url",
"updated_on",
],
)

# If no document bodies changed, there are no chunks to rebuild, so the
# delete-and-recreate step below can be skipped.
if not doc_ids_needing_chunks:
return

KnowledgeBaseChunk.objects.filter(
source_document_id__in=list(doc_ids_needing_chunks)
).delete()

chunks, texts = [], []
for slug, d in existing.items():
if d.id not in doc_ids_needing_chunks:
continue
body = pages[slug]["body"]
chunks.append(
KnowledgeBaseChunk(
source_document=d, index=0, content=body, metadata={}
)
)
texts.append(body)

if not chunks:
return

self._update_chunks(texts, chunks)
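
To make the slug, title and source URL derivation in `sync_knowledge_base_from_dev_docs` above concrete, here is a minimal sketch of the same steps for a hypothetical file `docs/installation/install-with-helm.md` (the file name is only an example):

```python
from pathlib import Path

# Hypothetical markdown file, given relative to the docs root.
rel = Path("installation/install-with-helm.md")

rel_without_md = rel.as_posix()[:-3]  # "installation/install-with-helm"
slug = f"dev/{rel_without_md}"        # "dev/installation/install-with-helm"

stem = rel.stem.replace("_", "-")     # "install-with-helm"
parts = [p for p in stem.split("-") if p]
title = " ".join(p[:1].upper() + p[1:].lower() for p in parts)
# title == "Install With Helm"

source_url = f"https://baserow.io/docs/{rel_without_md}"
# "https://baserow.io/docs/installation/install-with-helm"
```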

def _csv_path(self):
path = Path(__file__).resolve().parents[5] / "website_export.csv"
@@ -449,6 +616,17 @@ def _csv_path(self):

return path

def _get_docs_path(self) -> Path | None:
"""
Returns the path to the `docs` directory if it exists, otherwise None.
The folder is expected at `../../../../../../../docs` from this handler file.
"""

path = Path(__file__).resolve().parents[7] / "docs"
if not path.exists() or not path.is_dir():
return None
return path
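
The `parents[7]` lookup above climbs seven levels from the handler's own directory up to the repository root, where the `docs/` folder lives. A small sketch of that arithmetic, using a hypothetical location for this handler module (the folder names below `assistant/` are assumptions made only for illustration):

```python
from pathlib import Path

# Hypothetical path of this handler inside a checkout at /repo.
handler_file = Path(
    "/repo/enterprise/backend/src/baserow_enterprise/assistant/knowledge_base/sync/handler.py"
)

# parents[0] is the file's own folder; each index climbs one level higher.
docs_root = handler_file.parents[7] / "docs"
print(docs_root)  # /repo/docs
```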

def _csv_type_to_enum(self, csv_value: str | None) -> str:
v = (csv_value or "").strip()
if not v:
@@ -457,3 +635,22 @@ def _csv_type_to_enum(self, csv_value: str | None) -> str:
if v.lower() == dt.value.lower():
return dt.value
return KnowledgeBaseDocument.DocumentType.RAW_DOCUMENT

def _update_chunks(self, texts, chunks):
embeddings = self.vector_handler.embed_texts(texts)
if KnowledgeBaseChunk.can_search_vectors():
for c, e in zip(chunks, embeddings):
c.embedding = list(e)
c._embedding_array = list(e)
else:
for c, e in zip(chunks, embeddings):
c._embedding_array = list(e)

KnowledgeBaseChunk.objects.bulk_create(chunks)

def sync_knowledge_base(self):
# Ensure default categories exist (parents set by load_categories)
self.load_categories(DEFAULT_CATEGORIES)

self.sync_knowledge_base_from_csv()
self.sync_knowledge_base_from_dev_docs()
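
One detail in the update path above is worth spelling out: Django's `bulk_update` never triggers `auto_now`, so `sync_knowledge_base_from_dev_docs` stamps `updated_on` by calling the field's `pre_save` manually. A minimal sketch of the same idea, assuming `updated_on` is declared with `auto_now=True` (the field definition is not part of this diff) and `docs_to_update` is a placeholder list of modified documents:

```python
# Assumption: the model declares updated_on = models.DateTimeField(auto_now=True).
field = KnowledgeBaseDocument._meta.get_field("updated_on")

# docs_to_update: placeholder list of already-modified KnowledgeBaseDocument rows.
for doc in docs_to_update:
    # For auto_now fields, pre_save() sets and returns the current timestamp.
    doc.updated_on = field.pre_save(doc, add=False)

# bulk_update() only writes the listed fields and does not update auto_now fields
# on its own, which is why updated_on is passed explicitly.
KnowledgeBaseDocument.objects.bulk_update(docs_to_update, ["title", "updated_on"])
```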
@@ -0,0 +1,27 @@
# Generated by Django 5.0.14 on 2025-11-24 14:10

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("baserow_enterprise", "0055_assistantchatmessage_action_group_id_and_more"),
]

operations = [
migrations.AlterField(
model_name="knowledgebasedocument",
name="type",
field=models.CharField(
choices=[
("raw_document", "Raw Document"),
("baserow_user_docs", "Baserow User Docs"),
("baserow_dev_docs", "Baserow Dev Docs"),
("faq", "FAQ"),
("template", "Template"),
],
default="raw_document",
max_length=20,
),
),
]
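
With the migration applied, the new choice behaves like any other document type. A short sketch of querying the synced dev docs, using the field names that appear in the handler diff above:

```python
from baserow_enterprise.assistant.models import KnowledgeBaseDocument

# All dev docs documents that have been synced and marked ready.
dev_docs = KnowledgeBaseDocument.objects.filter(
    type=KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS,
    status=KnowledgeBaseDocument.Status.READY,
)
print(dev_docs.count())
```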