Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 158 additions & 2 deletions hospexplorer/ask/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@
from django.shortcuts import render
from django.urls import path, reverse

from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource
from ask.models import (
Conversation,
TermsAcceptance,
QARecord,
SimWorkflow,
WebsiteResource,
PDFResource,
DocumentType,
DocumentAuthorInstitution,
InstitutionType,
)
from ask.admin_csv import import_names_csv, parse_partial_date
from ask.kb_connector import delete_kb_document
from ask.tasks import run_kb_resource_upload

Expand Down Expand Up @@ -205,16 +216,98 @@ def delete_queryset(self, request, queryset):
return


class LookupCSVImportMixin:
"""Adds an Import CSV button + upload view to a lookup ModelAdmin.

CSV is single-column name. Duplicates are skipped, header row optional.
"""
# NOTE(review): leading indentation is missing in this view of the file, so
# the nesting of the statements below is inferred from syntax only. In
# particular, whether the redirect in import_csv_view runs for every POST or
# only after a successful import cannot be determined here; confirm against
# the repository before relying on either reading.

# Changelist template override (presumably the template that renders the
# Import CSV button; confirm against the template itself).
change_list_template = "admin/ask/lookup_change_list.html"

def get_urls(self):
# Adds an "import-csv/" route, wrapped in admin_site.admin_view, and lists it
# ahead of the URLs returned by the parent class.
urls = super().get_urls()
# (app_label, model_name) pair used to build a per-model URL name.
info = (self.model._meta.app_label, self.model._meta.model_name)
return [
path(
"import-csv/",
self.admin_site.admin_view(self.import_csv_view),
name=f"{info[0]}_{info[1]}_import_csv",
),
] + urls

def import_csv_view(self, request):
# Handles the CSV upload form: on POST it validates the uploaded file and
# hands it to import_names_csv; otherwise it renders the upload template.
info = (self.model._meta.app_label, self.model._meta.model_name)
changelist_url = reverse(f"admin:{info[0]}_{info[1]}_changelist")

if request.method == "POST":
# The upload is read from the "csv_file" form field.
file_obj = request.FILES.get("csv_file")
if file_obj is None:
self.message_user(request, "No file provided.", level="error")
# Extension check is case-insensitive; file contents are not inspected here.
elif not file_obj.name.lower().endswith(".csv"):
self.message_user(request, "File must have a .csv extension.", level="error")
else:
try:
created, skipped = import_names_csv(self.model, file_obj)
# Any import failure is logged with its traceback and reported to the user
# as an admin message rather than propagating out of the view.
except Exception as e:
logger.exception("CSV import failed for %s", self.model.__name__)
self.message_user(request, f"Import failed: {e}", level="error")
else:
self.message_user(
request,
f"Imported {created} new {self.model._meta.verbose_name_plural} "
f"(skipped {skipped} duplicate or empty rows).",
)
return HttpResponseRedirect(changelist_url)

# Template context: the admin site's common context plus the page title, the
# model options and the changelist URL.
context = {
**self.admin_site.each_context(request),
"title": f"Import {self.model._meta.verbose_name_plural} from CSV",
"opts": self.model._meta,
"changelist_url": changelist_url,
}
return render(request, "admin/ask/lookup_csv_import.html", context)


@admin.register(DocumentType)
class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for the DocumentType lookup list, with CSV import of names."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(DocumentAuthorInstitution)
class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for the DocumentAuthorInstitution lookup list, with CSV import of names."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(InstitutionType)
class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for the InstitutionType lookup list, with CSV import of names."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(WebsiteResource)
class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "url", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title", "url")
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "url")}),
("Metadata", {"fields": (
"date_published", "date_published_precision",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this website resource.",
"description": "Optional details about what this website covers.",
"url": "The URL the LLM will use as context when answering questions.",
"date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.",
}

def get_form(self, request, obj=None, **kwargs):
Expand Down Expand Up @@ -248,16 +341,72 @@ def save_model(self, request, obj, form, change):
)


# Optional metadata columns the zip-CSV importer reads onto each PDFResource.
# Controlled-list values create the matching lookup row the first time they
# appear, so the available options grow from what the imports actually use.
# Each key is both the CSV column name and the attribute name assigned on the
# resource; each value is the lookup model whose rows are matched by ``name``.
ZIP_CSV_LOOKUP_COLUMNS = {
    "document_type": DocumentType,
    "document_author_institution": DocumentAuthorInstitution,
    "institution_type": InstitutionType,
}


def _apply_zip_csv_metadata(obj, row):
    """Copy the optional metadata columns of one zip-CSV row onto ``obj``.

    ``row`` is a mapping of column name to raw cell text; missing or blank
    cells are ignored. ``date_published`` is parsed as a partial date and
    stored together with its precision. Each column named in
    ``ZIP_CSV_LOOKUP_COLUMNS`` is resolved to a lookup row by name (created
    if it does not exist yet) and assigned to the attribute of the same name.

    Returns a list of human-readable warnings, one per value that could not
    be applied. A warning never aborts the row: the affected field is simply
    left as it was.
    """
    problems = []

    raw_date = (row.get("date_published") or "").strip()
    if raw_date:
        published, precision = parse_partial_date(raw_date)
        if not published:
            problems.append(
                f"invalid date_published '{raw_date}' "
                "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank"
            )
        else:
            obj.date_published = published
            obj.date_published_precision = precision

    for field_name in ZIP_CSV_LOOKUP_COLUMNS:
        text = (row.get(field_name) or "").strip()
        if not text:
            continue
        # Over-long values are reported instead of being sent to the database.
        if len(text) > 255:
            problems.append(f"{field_name} value exceeds 255 characters; left blank")
            continue
        lookup_model = ZIP_CSV_LOOKUP_COLUMNS[field_name]
        instance, _created = lookup_model.objects.get_or_create(name=text)
        setattr(obj, field_name, instance)

    return problems


@admin.register(PDFResource)
class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "file", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title",)
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "file")}),
("Metadata", {"fields": (
"date_published", "date_published_precision",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this PDF resource.",
"description": "Optional details about what this PDF covers.",
"file": "The PDF file the LLM will use as context when answering questions.",
"date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.",
}

# Column names the bulk-import CSV must define (first = zip member, second = resource title)
Expand Down Expand Up @@ -350,7 +499,12 @@ def _is_real(name):

csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
reader = csv.DictReader(io.StringIO(csv_text))
csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])}
# strip header names so the column check and per-row lookups use
# the same keys; otherwise a header like "filename, title" leaves
# stray spaces and every row reads as missing its required fields
if reader.fieldnames:
reader.fieldnames = [(name or "").strip() for name in reader.fieldnames]
csv_columns = set(reader.fieldnames or [])
if not required_columns.issubset(csv_columns):
missing = ", ".join(sorted(required_columns - csv_columns))
messages.error(request, f"CSV is missing required columns: {missing}.")
Expand Down Expand Up @@ -393,6 +547,8 @@ def _is_real(name):
status=PDFResource.Status.PROCESSING,
status_message="Queued for Knowledge Base upload.",
)
for warning in _apply_zip_csv_metadata(obj, row):
messages.warning(request, f"Row {total}: {warning}")
obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True)
saved += 1
queued_ids.append(obj.pk)
Expand Down
55 changes: 55 additions & 0 deletions hospexplorer/ask/admin_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import csv
import datetime
import io
import re

# A partial ISO date: a 4-digit year, optionally a month, optionally a day.
# Day can only appear when month does, so "year-day" is impossible to express.
# re.ASCII limits \d to 0-9; without it, decimal digits from other scripts
# (which int() also accepts) would be treated as a valid ISO date.
_PARTIAL_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{1,2})(?:-(\d{1,2}))?)?$", re.ASCII)


def parse_partial_date(value):
    """Parse a partial ISO date string into a ``(date, precision)`` pair.

    Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` written with ASCII digits;
    surrounding whitespace is ignored. A missing month/day defaults to 1 so
    the value still fits a ``DateField``; ``precision`` ("year", "month" or
    "day") records how much was actually supplied so the padding can be
    ignored later.

    Returns ``(None, "")`` for ``None``, blank, malformed or out-of-range
    input (for example month 13, 30 February or year 0000); never raises for
    string input.
    """
    match = _PARTIAL_DATE_RE.match((value or "").strip())
    if match is None:
        return None, ""

    year, month, day = match.groups()
    if day is not None:
        precision = "day"
    elif month is not None:
        precision = "month"
    else:
        precision = "year"

    try:
        # Captured groups are non-empty strings when present, so "or 1" only
        # pads the parts that were not supplied ("0"/"00" still reach date()
        # and are rejected there).
        parsed = datetime.date(int(year), int(month or 1), int(day or 1))
    except ValueError:
        return None, ""
    return parsed, precision


def import_names_csv(model, file_obj):
    """Import a one-column CSV into a model with a ``name`` field.

    ``file_obj`` is a binary file-like object; its content is decoded as
    UTF-8 (a leading BOM is dropped, undecodable bytes are replaced). Only
    the first column of each row is used, with surrounding whitespace
    stripped.

    Returns ``(created, skipped)``. Blank rows, a leading header row of
    ``name`` (any letter case), and rows whose name already exists in the
    table are all counted as skipped. Only the first non-blank row is treated
    as a possible header, so a later row whose value happens to be "name" is
    imported like any other value.
    """
    text = file_obj.read().decode("utf-8-sig", errors="replace")

    created = 0
    skipped = 0
    header_possible = True
    for row in csv.reader(io.StringIO(text)):
        name = row[0].strip() if row else ""
        if not name:
            skipped += 1
            continue
        if header_possible:
            # The header can only be the first row that carries a value.
            header_possible = False
            if name.lower() == "name":
                skipped += 1
                continue
        _, was_created = model.objects.get_or_create(name=name)
        if was_created:
            created += 1
        else:
            skipped += 1
    return created, skipped
17 changes: 11 additions & 6 deletions hospexplorer/ask/kb_connector.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import time

Expand Down Expand Up @@ -31,22 +32,24 @@ def list_kb_documents(page=1, page_size=10):
return response.json()


def add_website_to_kb(url):
def add_website_to_kb(url, metadata=None):
"""Send a website URL to the MCP KB server for ingestion.

Calls POST /docs/website/add?url={url} on the MCP KB server.
The KB server fetches the page, chunks it, generates embeddings,
and stores it for semantic search.
``metadata`` (if provided) is sent as a JSON body ``{"metadata": ...}`` so
the KB server can store it on the Document row.
"""
headers = {
"Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
"Content-Type": "application/json",
}
endpoint = f"{settings.KB_MCP_HOST}/docs/website/add"

with httpx.Client() as client:
response = client.post(
endpoint,
params={"url": url},
json={"metadata": metadata} if metadata is not None else {},
headers=headers,
timeout=settings.KB_MCP_TIMEOUT,
)
Expand All @@ -55,12 +58,12 @@ def add_website_to_kb(url):
return response.json()


def add_pdf_to_kb(file_bytes, filename, title, url=None):
def add_pdf_to_kb(file_bytes, filename, title, url=None, metadata=None):
"""Upload a PDF to the MCP KB server for ingestion.

Calls POST /docs/pdf/add on the MCP KB server with multipart form data.
The KB server extracts text, chunks it, generates embeddings,
and stores it for semantic search.
metadata (if provided) is JSON-encoded into a metadata form field so
it can travel alongside the file.
"""
headers = {
"Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
Expand All @@ -70,6 +73,8 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
data = {"title": title}
if url:
data["url"] = url
if metadata is not None:
data["metadata"] = json.dumps(metadata)

# Only retry on transport errors (the request never completed) — a timeout
# likely means the KB received the file and is still processing it, so
Expand Down
Loading