77from django .db import transaction
88
99from httpx import Client as httpxClient
10+ from loguru import logger
1011from pgvector .django import L2Distance
1112
1213from baserow_enterprise .assistant .models import (
@@ -238,7 +239,7 @@ def load_categories(self, categories_serialized: Iterable[Tuple[str, str | None]
238239 categories_with_parents , ["parent_id" ]
239240 )
240241
241- def sync_knowledge_base (self ):
242+ def sync_knowledge_base_from_csv (self ):
242243 """
243244 Sync entries from `website_export.csv` with the knowledgebase documents and
244245 chunks. The idea is that this `website_export.csv` file can easily be
@@ -249,9 +250,6 @@ def sync_knowledge_base(self):
249250 removed from the source, it will also be removed in the documents.
250251 """
251252
252- # Ensure default categories exist (parents set by load_categories)
253- self .load_categories (DEFAULT_CATEGORIES )
254-
255253 csv_path = self ._csv_path ()
256254 with csv_path .open ("r" , encoding = "utf-8" , newline = "" ) as f :
257255 reader = csv .DictReader (f )
@@ -430,16 +428,185 @@ def sync_knowledge_base(self):
430428 if not chunks :
431429 return
432430
433- embeddings = self .vector_handler .embed_texts (texts )
434- if KnowledgeBaseChunk .can_search_vectors ():
435- for c , e in zip (chunks , embeddings ):
436- c .embedding = list (e )
437- c ._embedding_array = list (e )
438- else :
439- for c , e in zip (chunks , embeddings ):
440- c ._embedding_array = list (e )
431+ self ._update_chunks (texts , chunks )
432+
def sync_knowledge_base_from_dev_docs(self):
    """
    Sync the developer documentation from the local `docs/` folder with the
    knowledgebase documents and chunks. Every `.md` file is included; the
    title, slug and source URL are derived from the file path. Entries are
    created, updated or deleted so the database mirrors the folder, and the
    chunks/embeddings are rebuilt only for documents whose body changed.
    """

    docs_root = self._get_docs_path()
    if docs_root is None:
        # Bug fix: the previous message interpolated `docs_root`, which is
        # always None on this branch, logging "The None folder ...".
        logger.warning(
            "The docs folder does not exist, skip synchronizing the dev docs"
        )
        return

    doc_type = KnowledgeBaseDocument.DocumentType.BASEROW_DEV_DOCS

    pages = self._collect_dev_doc_pages(docs_root)
    slugs = set(pages.keys())

    # NOTE(review): assumes a category named "dev_docs" exists; when it
    # doesn't, documents are created without a category — confirm intended.
    dev_docs_category = KnowledgeBaseCategory.objects.filter(
        name="dev_docs"
    ).first()

    with transaction.atomic():
        existing = {
            d.slug: d for d in KnowledgeBaseDocument.objects.filter(type=doc_type)
        }

        # Delete docs that no longer have a corresponding markdown file. This
        # is needed because a file could be removed because it's no longer
        # relevant. It should then not show up in the docs anymore.
        to_delete_slugs = [s for s in existing.keys() if s not in slugs]
        if to_delete_slugs:
            KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=to_delete_slugs
            ).delete()
            for s in to_delete_slugs:
                existing.pop(s, None)

        create, update = [], []
        doc_ids_needing_chunks: set[int] = set()

        for slug, page in pages.items():
            doc = existing.get(slug)
            if doc:
                changed = False
                body_changed = False
                if doc.title != page["title"]:
                    doc.title = page["title"]
                    changed = True
                if doc.raw_content != page["body"]:
                    doc.raw_content = page["body"]
                    changed = True
                    body_changed = True
                if doc.content != page["body"]:
                    doc.content = page["body"]
                    changed = True
                    body_changed = True
                if dev_docs_category and doc.category_id != dev_docs_category.id:
                    doc.category = dev_docs_category
                    changed = True
                if doc.process_document:
                    doc.process_document = False
                    changed = True
                if doc.status != KnowledgeBaseDocument.Status.READY:
                    doc.status = KnowledgeBaseDocument.Status.READY
                    changed = True
                if doc.source_url != page["source_url"]:
                    doc.source_url = page["source_url"]
                    changed = True

                if changed:
                    update.append(doc)
                    # Only a body change invalidates the stored chunks.
                    if body_changed:
                        doc_ids_needing_chunks.add(doc.id)
            else:
                create.append(
                    KnowledgeBaseDocument(
                        title=page["title"],
                        slug=slug,
                        type=doc_type,
                        raw_content=page["body"],
                        process_document=False,
                        content=page["body"],
                        status=KnowledgeBaseDocument.Status.READY,
                        category=dev_docs_category,
                        source_url=page["source_url"],
                    )
                )

        if create:
            KnowledgeBaseDocument.objects.bulk_create(create)
            # Re-fetch so we have primary keys for the chunk rebuild below.
            fresh = KnowledgeBaseDocument.objects.filter(
                type=doc_type, slug__in=[d.slug for d in create]
            )
            for doc in fresh:
                existing[doc.slug] = doc
                doc_ids_needing_chunks.add(doc.id)

        if update:
            # The `updated_on` field is not saved during the bulk update, so
            # we need to pre_save this value manually. The field lookup is
            # hoisted out of the loop since it is loop-invariant.
            updated_on_field = KnowledgeBaseDocument._meta.get_field("updated_on")
            for doc in update:
                doc.updated_on = updated_on_field.pre_save(doc, add=False)

            KnowledgeBaseDocument.objects.bulk_update(
                update,
                [
                    "title",
                    "raw_content",
                    "process_document",
                    "content",
                    "status",
                    "category",
                    "source_url",
                    "updated_on",
                ],
            )

        # If there are no chunks to rebuild, we can skip the final part
        # because there is no need to delete and recreate the missing chunks.
        if not doc_ids_needing_chunks:
            return

        KnowledgeBaseChunk.objects.filter(
            source_document_id__in=list(doc_ids_needing_chunks)
        ).delete()

        chunks, texts = [], []
        for slug, doc in existing.items():
            if doc.id not in doc_ids_needing_chunks:
                continue
            body = pages[slug]["body"]
            chunks.append(
                KnowledgeBaseChunk(
                    source_document=doc, index=0, content=body, metadata={}
                )
            )
            texts.append(body)

        if not chunks:
            return

        self._update_chunks(texts, chunks)

def _collect_dev_doc_pages(self, docs_root):
    """
    Scan `docs_root` recursively for markdown files and return a mapping of
    slug -> {"title", "body", "source_url"} describing each doc page.
    """

    pages: dict[str, dict] = {}
    for md_path in docs_root.rglob("*.md"):
        rel_str = md_path.relative_to(docs_root).as_posix()
        # `rglob("*.md")` can match ".MD" on case-insensitive filesystems;
        # keep the guard so the suffix strip below is always valid.
        if not rel_str.lower().endswith(".md"):
            continue

        rel_without_md = rel_str[:-3]
        slug = f"dev/{rel_without_md}"

        # Derive a human-readable title from the file name, splitting on
        # dashes/underscores, e.g. "ci-cd" -> "Ci Cd".
        stem = md_path.stem
        parts = [p for p in stem.replace("_", "-").split("-") if p]
        if parts:
            title = " ".join(p[:1].upper() + p[1:].lower() for p in parts)
        else:
            title = stem[:1].upper() + stem[1:].lower()

        pages[slug] = {
            "title": title,
            "body": md_path.read_text(encoding="utf-8"),
            "source_url": f"https://baserow.io/docs/{rel_without_md}",
        }
    return pages
443610
444611 def _csv_path (self ):
445612 path = Path (__file__ ).resolve ().parents [5 ] / "website_export.csv"
@@ -449,6 +616,17 @@ def _csv_path(self):
449616
450617 return path
451618
def _get_docs_path(self) -> Path | None:
    """
    Return the path to the `docs` directory if it exists, otherwise None.
    The folder is expected at `../../../../../../../docs` from this file.
    """

    # `Path.is_dir()` is False for missing paths, so one check suffices.
    candidate = Path(__file__).resolve().parents[7] / "docs"
    return candidate if candidate.is_dir() else None
629+
452630 def _csv_type_to_enum (self , csv_value : str | None ) -> str :
453631 v = (csv_value or "" ).strip ()
454632 if not v :
@@ -457,3 +635,22 @@ def _csv_type_to_enum(self, csv_value: str | None) -> str:
457635 if v .lower () == dt .value .lower ():
458636 return dt .value
459637 return KnowledgeBaseDocument .DocumentType .RAW_DOCUMENT
638+
def _update_chunks(self, texts, chunks):
    """
    Embed `texts`, attach the resulting vectors to the matching `chunks`
    (pairwise, in order), and persist the chunks in bulk.
    """

    embeddings = self.vector_handler.embed_texts(texts)
    vectors_supported = KnowledgeBaseChunk.can_search_vectors()
    for chunk, embedding in zip(chunks, embeddings):
        chunk._embedding_array = list(embedding)
        # The `embedding` column is only populated when vector search is
        # available; a fresh list is stored in each attribute.
        if vectors_supported:
            chunk.embedding = list(embedding)

    KnowledgeBaseChunk.objects.bulk_create(chunks)
650+
def sync_knowledge_base(self):
    """
    Run the full knowledge base synchronization: ensure the default
    categories exist (parents are set by `load_categories`), then sync the
    CSV export followed by the developer documentation.
    """

    self.load_categories(DEFAULT_CATEGORIES)
    self.sync_knowledge_base_from_csv()
    self.sync_knowledge_base_from_dev_docs()
0 commit comments