diging · Girik1105 · May 11, 2026 · May 12, 2026 · May 21, 2026 · May 21, 2026
diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py
@@ -16,7 +16,18 @@
 from django.shortcuts import render
 from django.urls import path, reverse
 
-from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource
+from ask.models import (
+    Conversation,
+    TermsAcceptance,
+    QARecord,
+    SimWorkflow,
+    WebsiteResource,
+    PDFResource,
+    DocumentType,
+    DocumentAuthorInstitution,
+    InstitutionType,
+)
+from ask.admin_csv import import_names_csv, parse_partial_date
 from ask.kb_connector import delete_kb_document
 from ask.tasks import run_kb_resource_upload
 
@@ -205,16 +216,98 @@ def delete_queryset(self, request, queryset):
                 return
 
 
+class LookupCSVImportMixin:
+    """Add an Import CSV button and upload view to a lookup ModelAdmin.
+
+    CSV is single-column name; rows are created through ``import_names_csv``.
+    """
+
+    change_list_template = "admin/ask/lookup_change_list.html"
+
+    def get_urls(self):
+        """Return the ``import-csv/`` route followed by the inherited admin URLs."""
+        opts = self.model._meta
+        view = self.admin_site.admin_view(self.import_csv_view)
+        name = f"{opts.app_label}_{opts.model_name}_import_csv"
+        return [path("import-csv/", view, name=name)] + super().get_urls()
+
+    def import_csv_view(self, request):
+        """Show the upload form on GET; import the posted CSV and redirect on POST."""
+        from django.core.exceptions import PermissionDenied
+        # admin_view() only verifies staff login; creating rows needs "add".
+        if not self.has_add_permission(request):
+            raise PermissionDenied
+        opts = self.model._meta
+        changelist_url = reverse(f"admin:{opts.app_label}_{opts.model_name}_changelist")
+
+        if request.method == "POST":
+            file_obj = request.FILES.get("csv_file")
+            if file_obj is None:
+                self.message_user(request, "No file provided.", level="error")
+            elif not file_obj.name.lower().endswith(".csv"):
+                self.message_user(request, "File must have a .csv extension.", level="error")
+            else:
+                try:
+                    created, skipped = import_names_csv(self.model, file_obj)
+                except Exception as e:
+                    logger.exception("CSV import failed for %s", self.model.__name__)
+                    self.message_user(request, f"Import failed: {e}", level="error")
+                else:
+                    self.message_user(
+                        request,
+                        f"Imported {created} new {opts.verbose_name_plural} "
+                        f"(skipped {skipped} duplicate or empty rows).",
+                    )
+            return HttpResponseRedirect(changelist_url)
+        context = {
+            **self.admin_site.each_context(request),
+            "title": f"Import {opts.verbose_name_plural} from CSV",
+            "opts": opts,
+            "changelist_url": changelist_url,
+        }
+        return render(request, "admin/ask/lookup_csv_import.html", context)
+
+
+@admin.register(DocumentType)
+class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
+    list_display = ("name",)  # columns shown on the changelist
+    search_fields = ("name",)  # fields matched by the admin search box
+
+
+@admin.register(DocumentAuthorInstitution)
+class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin):
+    list_display = ("name",)  # columns shown on the changelist
+    search_fields = ("name",)  # fields matched by the admin search box
+
+
+@admin.register(InstitutionType)
+class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
+    list_display = ("name",)  # columns shown on the changelist
+    search_fields = ("name",)  # fields matched by the admin search box
+
+
 @admin.register(WebsiteResource)
 class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
     list_display = ("title", "url", "creator", "status", "modified_at")
     list_filter = ("status",)
     search_fields = ("title", "url")
     readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
+    fieldsets = (
+        (None, {"fields": ("title", "description", "url")}),
+        ("Metadata", {"fields": (
+            "date_published", "date_published_precision",
+            "document_type", "document_author_institution", "institution_type",
+        )}),
+        ("Status", {"fields": (
+            "status", "status_message", "mcp_kb_document_id",
+            "created_at", "modified_at", "creator", "modifier",
+        )}),
+    )
     help_texts = {
         "title": "A short name to identify this website resource.",
         "description": "Optional details about what this website covers.",
         "url": "The URL the LLM will use as context when answering questions.",
+        "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.",
     }
 
     def get_form(self, request, obj=None, **kwargs):
@@ -248,16 +341,72 @@ def save_model(self, request, obj, form, change):
         )
 
 
+# Optional metadata columns the zip-CSV importer reads onto each PDFResource.
+# Controlled-list values create the matching lookup row the first time they
+# appear, so the available options grow from what the imports actually use.
+ZIP_CSV_LOOKUP_COLUMNS = {
+    "document_type": DocumentType,
+    "document_author_institution": DocumentAuthorInstitution,
+    "institution_type": InstitutionType,
+}
+
+
+def _apply_zip_csv_metadata(obj, row):
+    """Copy the optional metadata columns of one zip-CSV row onto ``obj``.
+
+    No metadata column is required. A value that cannot be applied is
+    described in the returned list of warning strings and its field is left
+    unset, so the caller can still import the row.
+    """
+    problems = []
+
+    # parse_partial_date yields a falsy date when the text cannot be parsed.
+    raw_date = (row.get("date_published") or "").strip()
+    if raw_date:
+        published, granularity = parse_partial_date(raw_date)
+        if not published:
+            problems.append(
+                f"invalid date_published '{raw_date}' "
+                "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank"
+            )
+        else:
+            obj.date_published = published
+            obj.date_published_precision = granularity
+
+    # Each lookup column points at the row with that name, created on demand.
+    for field_name, lookup_model in ZIP_CSV_LOOKUP_COLUMNS.items():
+        text = (row.get(field_name) or "").strip()
+        if len(text) > 255:
+            problems.append(f"{field_name} value exceeds 255 characters; left blank")
+        elif text:
+            instance, _ = lookup_model.objects.get_or_create(name=text)
+            setattr(obj, field_name, instance)
+
+    return problems
+
+
 @admin.register(PDFResource)
 class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
     list_display = ("title", "file", "creator", "status", "modified_at")
     list_filter = ("status",)
     search_fields = ("title",)
     readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
+    fieldsets = (
+        (None, {"fields": ("title", "description", "file")}),
+        ("Metadata", {"fields": (
+            "date_published", "date_published_precision",
+            "document_type", "document_author_institution", "institution_type",
+        )}),
+        ("Status", {"fields": (
+            "status", "status_message", "mcp_kb_document_id",
+            "created_at", "modified_at", "creator", "modifier",
+        )}),
+    )
     help_texts = {
         "title": "A short name to identify this PDF resource.",
         "description": "Optional details about what this PDF covers.",
         "file": "The PDF file the LLM will use as context when answering questions.",
+        "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.",
     }
 
     # Column names the bulk-import CSV must define (first = zip member, second = resource title)
@@ -350,7 +499,12 @@ def _is_real(name):
 
                 csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
                 reader = csv.DictReader(io.StringIO(csv_text))
-                csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])}
+                # strip header names so the column check and per-row lookups use
+                # the same keys; otherwise a header like "filename, title" leaves
+                # stray spaces and every row reads as missing its required fields
+                if reader.fieldnames:
+                    reader.fieldnames = [(name or "").strip() for name in reader.fieldnames]
+                csv_columns = set(reader.fieldnames or [])
                 if not required_columns.issubset(csv_columns):
                     missing = ", ".join(sorted(required_columns - csv_columns))
                     messages.error(request, f"CSV is missing required columns: {missing}.")
@@ -393,6 +547,8 @@ def _is_real(name):
                         status=PDFResource.Status.PROCESSING,
                         status_message="Queued for Knowledge Base upload.",
                     )
+                    for warning in _apply_zip_csv_metadata(obj, row):
+                        messages.warning(request, f"Row {total}: {warning}")
                     obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True)
                     saved += 1
                     queued_ids.append(obj.pk)

diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py
@@ -0,0 +1,55 @@
+import csv
+import datetime
+import io
+import re
+
+# Partial ISO date pattern: a four-digit year, then optionally "-month", then
+# optionally "-day". The day group is nested inside the month group.
+_PARTIAL_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{1,2})(?:-(\d{1,2}))?)?$")
+
+
+def parse_partial_date(value):
+    """Turn ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` text into ``(date, precision)``.
+
+    Components not supplied are set to 1 so a real ``datetime.date`` can be
+    built; ``precision`` ("year", "month" or "day") names the last component
+    given. Blank, malformed or impossible dates yield ``(None, "")``.
+    """
+    found = _PARTIAL_DATE_RE.match((value or "").strip())
+    if found is None:
+        return None, ""
+    yyyy, mm, dd = found.groups()
+    if dd is not None:
+        precision = "day"
+    elif mm is not None:
+        precision = "month"
+    else:
+        precision = "year"
+    try:
+        return datetime.date(int(yyyy), int(mm or 1), int(dd or 1)), precision
+    except ValueError:
+        return None, ""
+
+
+def import_names_csv(model, file_obj):
+    """Import a one-column CSV into a model with a ``name`` field.
+
+    Reads ``file_obj`` as UTF-8 (BOM tolerated) and calls
+    ``model.objects.get_or_create(name=...)`` for each stripped first cell.
+    Returns ``(created, skipped)``; blank rows, a ``name`` header on the first
+    non-blank row, and names that already exist all count as skipped.
+    """
+    text = file_obj.read().decode("utf-8-sig", errors="replace")
+
+    created = skipped = 0
+    header_possible = True
+    for row in csv.reader(io.StringIO(text)):
+        name = row[0].strip() if row else ""
+        # Only the first non-blank row may be a header; a later "Name" is data.
+        is_header = header_possible and name.lower() == "name"
+        header_possible = header_possible and not name
+        if name and not is_header and model.objects.get_or_create(name=name)[1]:
+            created += 1
+        else:
+            skipped += 1
+    return created, skipped
diff --git a/hospexplorer/ask/kb_connector.py b/hospexplorer/ask/kb_connector.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import time
 
@@ -31,22 +32,24 @@ def list_kb_documents(page=1, page_size=10):
     return response.json()
 
 
-def add_website_to_kb(url):
+def add_website_to_kb(url, metadata=None):
     """Send a website URL to the MCP KB server for ingestion.
 
     Calls POST /docs/website/add?url={url} on the MCP KB server.
-    The KB server fetches the page, chunks it, generates embeddings,
-    and stores it for semantic search.
+    ``metadata`` (if provided) is sent as a JSON body ``{"metadata": ...}`` so
+    the KB server can store it on the Document row.
     """
     headers = {
         "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
+        "Content-Type": "application/json",
     }
     endpoint = f"{settings.KB_MCP_HOST}/docs/website/add"
 
     with httpx.Client() as client:
         response = client.post(
             endpoint,
             params={"url": url},
+            json={"metadata": metadata} if metadata is not None else {},
             headers=headers,
             timeout=settings.KB_MCP_TIMEOUT,
         )
@@ -55,12 +58,12 @@ def add_website_to_kb(url):
     return response.json()
 
 
-def add_pdf_to_kb(file_bytes, filename, title, url=None):
+def add_pdf_to_kb(file_bytes, filename, title, url=None, metadata=None):
     """Upload a PDF to the MCP KB server for ingestion.
 
     Calls POST /docs/pdf/add on the MCP KB server with multipart form data.
-    The KB server extracts text, chunks it, generates embeddings,
-    and stores it for semantic search.
+    metadata (if provided) is JSON-encoded into a metadata form field so
+    it can travel alongside the file.
     """
     headers = {
         "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
@@ -70,6 +73,8 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
     data = {"title": title}
     if url:
         data["url"] = url
+    if metadata is not None:
+        data["metadata"] = json.dumps(metadata)
 
     # Only retry on transport errors (the request never completed) — a timeout
     # likely means the KB received the file and is still processing it, so