Skip to content

Commit ad495e4

Browse files
authored
Add dev docs to the assistant knowledge base (baserow#4321)
1 parent e9361bd commit ad495e4

File tree

5 files changed

+405
-13
lines changed

5 files changed

+405
-13
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"type": "feature",
3+
"message": "Synchronizes the dev docs with the assistant knowledge base documents.",
4+
"issue_origin": "github",
5+
"issue_number": null,
6+
"domain": "core",
7+
"bullet_points": [],
8+
"created_at": "2025-11-24"
9+
}

enterprise/backend/src/baserow_enterprise/assistant/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ class DocumentCategory(NamedTuple):
191191
DocumentCategory("enterprise", "billing"),
192192
# FAQ
193193
DocumentCategory("faq", None),
194+
# Dev Docs
195+
DocumentCategory("dev_docs", None),
194196
]
195197

196198

@@ -264,6 +266,10 @@ class DocumentType(models.TextChoices):
264266
"""
265267
Documents downloaded from `baserow.io/user-docs`, our online Knowledge Base.
266268
"""
269+
BASEROW_DEV_DOCS = "baserow_dev_docs", "Baserow Dev Docs"
270+
"""
271+
Documents downloaded from `baserow.io/docs`, the dev docs.
272+
"""
267273
FAQ = "faq", "FAQ"
268274
"""
269275
Frequently Asked Question. It could be a single question or multiple ones for

enterprise/backend/src/baserow_enterprise/assistant/tools/search_user_docs/handler.py

Lines changed: 210 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from django.db import transaction
88

99
from httpx import Client as httpxClient
10+
from loguru import logger
1011
from pgvector.django import L2Distance
1112

1213
from baserow_enterprise.assistant.models import (
@@ -238,7 +239,7 @@ def load_categories(self, categories_serialized: Iterable[Tuple[str, str | None]
238239
categories_with_parents, ["parent_id"]
239240
)
240241

241-
def sync_knowledge_base(self):
242+
def sync_knowledge_base_from_csv(self):
242243
"""
243244
Sync entries from `website_export.csv` with the knowledgebase documents and
244245
chunks. The idea is that this `website_export.csv` file can easily be
@@ -249,9 +250,6 @@ def sync_knowledge_base(self):
249250
removed from the source, it will also be removed in the documents.
250251
"""
251252

252-
# Ensure default categories exist (parents set by load_categories)
253-
self.load_categories(DEFAULT_CATEGORIES)
254-
255253
csv_path = self._csv_path()
256254
with csv_path.open("r", encoding="utf-8", newline="") as f:
257255
reader = csv.DictReader(f)
@@ -430,16 +428,185 @@ def sync_knowledge_base(self):
430428
if not chunks:
431429
return
432430

433-
embeddings = self.vector_handler.embed_texts(texts)
434-
if KnowledgeBaseChunk.can_search_vectors():
435-
for c, e in zip(chunks, embeddings):
436-
c.embedding = list(e)
437-
c._embedding_array = list(e)
438-
else:
439-
for c, e in zip(chunks, embeddings):
440-
c._embedding_array = list(e)
431+
self._update_chunks(texts, chunks)
432+
433+
def sync_knowledge_base_from_dev_docs(self):
    """
    Sync the developer documentation from the local `docs/` folder with the
    knowledgebase documents and chunks.

    Every `.md` file below the docs folder becomes one document of type
    `BASEROW_DEV_DOCS`. The slug (`dev/<relative path without .md>`), the title
    (derived from the file name) and the source url
    (`https://baserow.io/docs/<relative path without .md>`) are figured out
    automatically. Documents are created, updated or deleted so that the stored
    set matches the files on disk. For every document that is new or whose body
    changed, the existing chunks are removed and a single chunk containing the
    full body is created.

    If the docs folder can't be found, a warning is logged and nothing is synced.
    """

    docs_root = self._get_docs_path()
    if docs_root is None:
        # There is no path to report here because `_get_docs_path` only returns
        # `None` when the folder is missing.
        logger.warning(
            "The dev docs folder does not exist, skip synchronizing the dev docs"
        )
        return

    doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS
    ready = KnowledgeBaseDocument.Status.READY

    pages = self._collect_dev_doc_pages(docs_root)

    dev_docs_category = KnowledgeBaseCategory.objects.filter(
        name="dev_docs"
    ).first()

    with transaction.atomic():
        existing = {
            d.slug: d for d in KnowledgeBaseDocument.objects.filter(type=doc_type)
        }

        # Delete docs that no longer have a corresponding markdown file. This is
        # needed because a file could be removed because it's no longer relevant.
        # It should then not show up in the docs anymore.
        to_delete_slugs = [s for s in existing if s not in pages]
        if to_delete_slugs:
            KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=to_delete_slugs
            ).delete()
            for s in to_delete_slugs:
                existing.pop(s, None)

        create, update = [], []
        doc_ids_needing_chunks: set[int] = set()

        for slug, p in pages.items():
            d = existing.get(slug)
            if d is None:
                create.append(
                    KnowledgeBaseDocument(
                        title=p["title"],
                        slug=slug,
                        type=doc_type,
                        raw_content=p["body"],
                        process_document=False,
                        content=p["body"],
                        status=ready,
                        category=dev_docs_category,
                        source_url=p["source_url"],
                    )
                )
                continue

            # Only a changed body requires the chunk (and its embedding) to be
            # rebuilt, the other fields just need to be saved.
            body_changed = d.raw_content != p["body"] or d.content != p["body"]
            changed = body_changed
            if body_changed:
                d.raw_content = p["body"]
                d.content = p["body"]
            if d.title != p["title"]:
                d.title = p["title"]
                changed = True
            if dev_docs_category and d.category_id != dev_docs_category.id:
                d.category = dev_docs_category
                changed = True
            if d.process_document:
                d.process_document = False
                changed = True
            if d.status != ready:
                d.status = ready
                changed = True
            if d.source_url != p["source_url"]:
                d.source_url = p["source_url"]
                changed = True

            if changed:
                update.append(d)
            if body_changed:
                doc_ids_needing_chunks.add(d.id)

        if create:
            KnowledgeBaseDocument.objects.bulk_create(create)
            # Fetch the created documents again so that they're guaranteed to
            # have an id before the chunks are related to them.
            fresh = KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=[d.slug for d in create]
            )
            for d in fresh:
                existing[d.slug] = d
                doc_ids_needing_chunks.add(d.id)

        if update:
            # `bulk_update` doesn't call `pre_save` on the fields, so the
            # `updated_on` value must be computed manually and included in the
            # updated fields.
            updated_on_field = KnowledgeBaseDocument._meta.get_field("updated_on")
            for d in update:
                d.updated_on = updated_on_field.pre_save(d, add=False)

            KnowledgeBaseDocument.objects.bulk_update(
                update,
                [
                    "title",
                    "raw_content",
                    "process_document",
                    "content",
                    "status",
                    "category",
                    "source_url",
                    "updated_on",
                ],
            )

        # If there are no chunks to rebuild, we can skip the final part because
        # there is no need to delete and recreate the missing chunks.
        if not doc_ids_needing_chunks:
            return

        KnowledgeBaseChunk.objects.filter(
            source_document_id__in=list(doc_ids_needing_chunks)
        ).delete()

        chunks, texts = [], []
        for slug, d in existing.items():
            if d.id not in doc_ids_needing_chunks:
                continue
            body = pages[slug]["body"]
            chunks.append(
                KnowledgeBaseChunk(
                    source_document=d, index=0, content=body, metadata={}
                )
            )
            texts.append(body)

        if not chunks:
            return

        self._update_chunks(texts, chunks)

def _collect_dev_doc_pages(self, docs_root: Path) -> dict[str, dict]:
    """
    Reads every markdown file below `docs_root`.

    :param docs_root: The directory that is recursively scanned for `.md` files.
    :return: A dict where the key is the slug of the page and the value a dict
        containing the `title`, `body` and `source_url` of the page.
    """

    pages: dict[str, dict] = {}

    for md_path in docs_root.rglob("*.md"):
        # A directory can also match the `*.md` pattern, but only a file has a
        # body that can be read.
        if not md_path.is_file():
            continue

        # Strip the three characters of the `.md` extension matched above.
        rel_without_md = md_path.relative_to(docs_root).as_posix()[:-3]

        pages[f"dev/{rel_without_md}"] = {
            "title": self._dev_doc_title(md_path.stem),
            "body": md_path.read_text(encoding="utf-8"),
            "source_url": f"https://baserow.io/docs/{rel_without_md}",
        }

    return pages

def _dev_doc_title(self, stem: str) -> str:
    """
    Derives a human readable title from a file name without extension, e.g.
    `ci-cd` or `ci_cd` becomes `Ci Cd`.

    :param stem: The file name without the extension.
    :return: The words of the stem, split on `-` and `_`, each starting with an
        uppercase character followed by lowercase characters, joined by a space.
    """

    parts = [p for p in stem.replace("_", "-").split("-") if p]
    if not parts:
        # The stem only consists of separators, so use it as is.
        return stem[:1].upper() + stem[1:].lower()
    return " ".join(p[:1].upper() + p[1:].lower() for p in parts)
443610

444611
def _csv_path(self):
445612
path = Path(__file__).resolve().parents[5] / "website_export.csv"
@@ -449,6 +616,17 @@ def _csv_path(self):
449616

450617
return path
451618

619+
def _get_docs_path(self) -> Path | None:
620+
"""
621+
Returns the path to the `docs` directory if it exists, otherwise None.
622+
The folder is expected at `../../../../../../../docs` from this handler file.
623+
"""
624+
625+
path = Path(__file__).resolve().parents[7] / "docs"
626+
if not path.exists() or not path.is_dir():
627+
return None
628+
return path
629+
452630
def _csv_type_to_enum(self, csv_value: str | None) -> str:
453631
v = (csv_value or "").strip()
454632
if not v:
@@ -457,3 +635,22 @@ def _csv_type_to_enum(self, csv_value: str | None) -> str:
457635
if v.lower() == dt.value.lower():
458636
return dt.value
459637
return KnowledgeBaseDocument.DocumentType.RAW_DOCUMENT
638+
639+
def _update_chunks(self, texts, chunks):
640+
embeddings = self.vector_handler.embed_texts(texts)
641+
if KnowledgeBaseChunk.can_search_vectors():
642+
for c, e in zip(chunks, embeddings):
643+
c.embedding = list(e)
644+
c._embedding_array = list(e)
645+
else:
646+
for c, e in zip(chunks, embeddings):
647+
c._embedding_array = list(e)
648+
649+
KnowledgeBaseChunk.objects.bulk_create(chunks)
650+
651+
def sync_knowledge_base(self):
652+
# Ensure default categories exist (parents set by load_categories)
653+
self.load_categories(DEFAULT_CATEGORIES)
654+
655+
self.sync_knowledge_base_from_csv()
656+
self.sync_knowledge_base_from_dev_docs()
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Generated by Django 5.0.14 on 2025-11-24 14:10
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """
    Alters the `type` field of `knowledgebasedocument` so that its choices
    include `baserow_dev_docs`.
    """

    dependencies = [
        ("baserow_enterprise", "0055_assistantchatmessage_action_group_id_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="knowledgebasedocument",
            name="type",
            field=models.CharField(
                max_length=20,
                default="raw_document",
                choices=[
                    ("raw_document", "Raw Document"),
                    ("baserow_user_docs", "Baserow User Docs"),
                    ("baserow_dev_docs", "Baserow Dev Docs"),
                    ("faq", "FAQ"),
                    ("template", "Template"),
                ],
            ),
        ),
    ]

0 commit comments

Comments
 (0)