From 91f791527065d59a601373b43d0cc1680315cf3d Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Mon, 11 May 2026 16:32:50 -0700 Subject: [PATCH 1/4] [HOP-60] Added Models and registered in admin for 3 obj types --- hospexplorer/ask/admin.py | 54 ++++++++++- ...authorinstitution_documenttype_and_more.py | 94 +++++++++++++++++++ hospexplorer/ask/models.py | 64 +++++++++++++ 3 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index cd99b3e..3cf6c5c 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -6,7 +6,17 @@ from django.contrib.auth.models import User from django.db import transaction -from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource +from ask.models import ( + Conversation, + TermsAcceptance, + QARecord, + SimWorkflow, + WebsiteResource, + PDFResource, + DocumentType, + DocumentAuthorInstitution, + InstitutionType, +) from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -181,16 +191,46 @@ def delete_queryset(self, request, queryset): return +@admin.register(DocumentType) +class DocumentTypeAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + +@admin.register(DocumentAuthorInstitution) +class DocumentAuthorInstitutionAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + +@admin.register(InstitutionType) +class InstitutionTypeAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + @admin.register(WebsiteResource) class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_display = ("title", "url", "creator", "status", "modified_at") list_filter = ("status",) search_fields = ("title", "url") readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message") + fieldsets = ( + (None, {"fields": ("title", "description", "url")}), + ("Metadata", {"fields": ( + "date_published", "date_published_precision", + "document_type", "document_author_institution", "institution_type", + )}), + ("Status", {"fields": ( + "status", "status_message", "mcp_kb_document_id", + "created_at", "modified_at", "creator", "modifier", + )}), + ) help_texts = { "title": "A short name to identify this website resource.", "description": "Optional details about what this website covers.", "url": "The URL the LLM will use as context when answering questions.", + "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.", } def get_form(self, request, obj=None, **kwargs): @@ -230,10 +270,22 @@ class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_filter = ("status",) search_fields = ("title",) readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message") + fieldsets = ( + (None, {"fields": ("title", "description", "file")}), + ("Metadata", {"fields": ( + "date_published", "date_published_precision", + "document_type", "document_author_institution", "institution_type", + )}), + ("Status", {"fields": ( + "status", "status_message", "mcp_kb_document_id", + "created_at", "modified_at", "creator", "modifier", + )}), + ) help_texts = { "title": "A short name to identify this PDF resource.", "description": "Optional details about what this PDF covers.", "file": "The PDF file the LLM will use as context when answering questions.", + "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.", } def get_form(self, request, obj=None, **kwargs): diff --git a/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py b/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py new file mode 100644 index 0000000..dfe1411 --- /dev/null +++ b/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py @@ -0,0 +1,94 @@ +# Generated by Django 6.0.2 on 2026-05-11 23:30 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ask', '0012_pdfresource_status_pdfresource_status_message_and_more'), + ] + + operations = [ + migrations.CreateModel( + name='DocumentAuthorInstitution', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.CreateModel( + name='DocumentType', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.CreateModel( + name='InstitutionType', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.AddField( + model_name='pdfresource', + name='date_published', + field=models.DateField(blank=True, null=True), + ), + migrations.AddField( + model_name='pdfresource', + name='date_published_precision', + field=models.CharField(blank=True, choices=[('year', 'Year'), ('month', 'Month'), ('day', 'Day')], default='', max_length=10), + ), + migrations.AddField( + model_name='websiteresource', + name='date_published', + field=models.DateField(blank=True, null=True), + ), + migrations.AddField( + model_name='websiteresource', + name='date_published_precision', + field=models.CharField(blank=True, choices=[('year', 'Year'), ('month', 'Month'), ('day', 'Day')], default='', max_length=10), + ), + migrations.AddField( + model_name='pdfresource', + name='document_author_institution', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documentauthorinstitution'), + ), + migrations.AddField( + model_name='websiteresource', + name='document_author_institution', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documentauthorinstitution'), + ), + migrations.AddField( + model_name='pdfresource', + name='document_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documenttype'), + ), + migrations.AddField( + model_name='websiteresource', + name='document_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documenttype'), + ), + migrations.AddField( + model_name='pdfresource', + name='institution_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.institutiontype'), + ), + migrations.AddField( + model_name='websiteresource', + name='institution_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.institutiontype'), + ), + ] diff --git a/hospexplorer/ask/models.py b/hospexplorer/ask/models.py index 70fa94b..e51d420 100644 --- a/hospexplorer/ask/models.py +++ b/hospexplorer/ask/models.py @@ -3,6 +3,36 @@ from django.conf import settings from django.db import models +class DocumentType(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + +class DocumentAuthorInstitution(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + +class InstitutionType(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + # Abstract Model, fields are inherited by subclasses class Resource(models.Model): class Status(models.TextChoices): @@ -11,6 +41,11 @@ class Status(models.TextChoices): ERROR = "error", "Error" WARNING = "warning", "Warning" + class DatePrecision(models.TextChoices): + YEAR = "year", "Year" + MONTH = "month", "Month" + DAY = "day", "Day" + title = models.CharField(max_length=255) description = models.TextField(blank=True, default="") creator = models.ForeignKey( @@ -34,6 +69,35 @@ class Status(models.TextChoices): ) status_message = models.TextField(blank=True, default="") + date_published = models.DateField(null=True, blank=True) + date_published_precision = models.CharField( + max_length=10, + choices=DatePrecision.choices, + blank=True, + default="", + ) + document_type = models.ForeignKey( + "DocumentType", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + document_author_institution = models.ForeignKey( + "DocumentAuthorInstitution", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + institution_type = models.ForeignKey( + "InstitutionType", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + class Meta: abstract = True From 1eaf9b597e921dde66f744edcc2fe021c94eb716 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Tue, 12 May 2026 14:20:05 -0700 Subject: [PATCH 2/4] [HOP-63] Added csv templates for admin, added csv import funcs with edge cases, incorported metadata field in resources abstract model and added it in website and pdf resources --- hospexplorer/ask/admin.py | 62 ++++++++++++++++++- hospexplorer/ask/admin_csv.py | 26 ++++++++ hospexplorer/ask/kb_connector.py | 17 +++-- hospexplorer/ask/tasks.py | 24 ++++++- .../admin/ask/lookup_change_list.html | 9 +++ .../admin/ask/lookup_csv_import.html | 27 ++++++++ 6 files changed, 154 insertions(+), 11 deletions(-) create mode 100644 hospexplorer/ask/admin_csv.py create mode 100644 hospexplorer/ask/templates/admin/ask/lookup_change_list.html create mode 100644 hospexplorer/ask/templates/admin/ask/lookup_csv_import.html diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 3cf6c5c..6ee1a97 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -5,6 +5,9 @@ from django.contrib.auth.admin import UserAdmin from django.contrib.auth.models import User from django.db import transaction +from django.http import HttpResponseRedirect +from django.shortcuts import render +from django.urls import path, reverse from ask.models import ( Conversation, @@ -17,6 +20,7 @@ DocumentAuthorInstitution, InstitutionType, ) +from ask.admin_csv import import_names_csv from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -191,20 +195,72 @@ def delete_queryset(self, request, queryset): return +class LookupCSVImportMixin: + """Adds an Import CSV button + upload view to a lookup ModelAdmin. + + CSV is single-column name. Duplicates are skipped, header row optional. + """ + + change_list_template = "admin/ask/lookup_change_list.html" + + def get_urls(self): + urls = super().get_urls() + info = (self.model._meta.app_label, self.model._meta.model_name) + return [ + path( + "import-csv/", + self.admin_site.admin_view(self.import_csv_view), + name=f"{info[0]}_{info[1]}_import_csv", + ), + ] + urls + + def import_csv_view(self, request): + info = (self.model._meta.app_label, self.model._meta.model_name) + changelist_url = reverse(f"admin:{info[0]}_{info[1]}_changelist") + + if request.method == "POST": + file_obj = request.FILES.get("csv_file") + if file_obj is None: + self.message_user(request, "No file provided.", level="error") + elif not file_obj.name.lower().endswith(".csv"): + self.message_user(request, "File must have a .csv extension.", level="error") + else: + try: + created, skipped = import_names_csv(self.model, file_obj) + except Exception as e: + logger.exception("CSV import failed for %s", self.model.__name__) + self.message_user(request, f"Import failed: {e}", level="error") + else: + self.message_user( + request, + f"Imported {created} new {self.model._meta.verbose_name_plural} " + f"(skipped {skipped} duplicate or empty rows).", + ) + return HttpResponseRedirect(changelist_url) + + context = { + **self.admin_site.each_context(request), + "title": f"Import {self.model._meta.verbose_name_plural} from CSV", + "opts": self.model._meta, + "changelist_url": changelist_url, + } + return render(request, "admin/ask/lookup_csv_import.html", context) + + @admin.register(DocumentType) -class DocumentTypeAdmin(admin.ModelAdmin): +class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) @admin.register(DocumentAuthorInstitution) -class DocumentAuthorInstitutionAdmin(admin.ModelAdmin): +class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) @admin.register(InstitutionType) -class InstitutionTypeAdmin(admin.ModelAdmin): +class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py new file mode 100644 index 0000000..beebfd6 --- /dev/null +++ b/hospexplorer/ask/admin_csv.py @@ -0,0 +1,26 @@ +import csv +import io + + +def import_names_csv(model, file_obj): + """Import a one-column CSV into a model with a ``name`` field. + + Returns ``(created, skipped)``. Blank rows, a leading header row of ``name``, + and rows whose name already exists in the table are all counted as skipped. + """ + text = file_obj.read().decode("utf-8-sig", errors="replace") + reader = csv.reader(io.StringIO(text)) + + created = 0 + skipped = 0 + for row in reader: + name = row[0].strip() if row else "" + if not name or name.lower() == "name": + skipped += 1 + continue + _, was_created = model.objects.get_or_create(name=name) + if was_created: + created += 1 + else: + skipped += 1 + return created, skipped diff --git a/hospexplorer/ask/kb_connector.py b/hospexplorer/ask/kb_connector.py index 94bdf40..39b6ea0 100644 --- a/hospexplorer/ask/kb_connector.py +++ b/hospexplorer/ask/kb_connector.py @@ -1,3 +1,4 @@ +import json import logging import httpx @@ -30,15 +31,16 @@ def list_kb_documents(page=1, page_size=10): return response.json() -def add_website_to_kb(url): +def add_website_to_kb(url, metadata=None): """Send a website URL to the MCP KB server for ingestion. Calls POST /docs/website/add?url={url} on the MCP KB server. - The KB server fetches the page, chunks it, generates embeddings, - and stores it for semantic search. + ``metadata`` (if provided) is sent as a JSON body ``{"metadata": ...}`` so + the KB server can store it on the Document row. """ headers = { "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}", + "Content-Type": "application/json", } endpoint = f"{settings.KB_MCP_HOST}/docs/website/add" @@ -46,6 +48,7 @@ def add_website_to_kb(url): response = client.post( endpoint, params={"url": url}, + json={"metadata": metadata} if metadata is not None else {}, headers=headers, timeout=settings.KB_MCP_TIMEOUT, ) @@ -54,12 +57,12 @@ def add_website_to_kb(url): return response.json() -def add_pdf_to_kb(file_bytes, filename, title, url=None): +def add_pdf_to_kb(file_bytes, filename, title, url=None, metadata=None): """Upload a PDF to the MCP KB server for ingestion. Calls POST /docs/pdf/add on the MCP KB server with multipart form data. - The KB server extracts text, chunks it, generates embeddings, - and stores it for semantic search. + metadata (if provided) is JSON-encoded into a metadata form field so + it can travel alongside the file. """ headers = { "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}", @@ -70,6 +73,8 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None): data = {"title": title} if url: data["url"] = url + if metadata is not None: + data["metadata"] = json.dumps(metadata) with httpx.Client() as client: response = client.post( diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index e290310..89b53b3 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -144,6 +144,23 @@ def run_llm_task(task_id, record_id, conversation_id): close_old_connections() +def _build_resource_metadata(obj): + """Serialize a Resource's metadata fields into a JSON-safe dict. + + FK lookups are flattened to their ``name`` so the MCP payload is + self-describing and doesn't depend on hosp-explorer's local IDs. + """ + return { + "date_published": obj.date_published.isoformat() if obj.date_published else None, + "date_published_precision": obj.date_published_precision or None, + "document_type": obj.document_type.name if obj.document_type_id else None, + "document_author_institution": ( + obj.document_author_institution.name if obj.document_author_institution_id else None + ), + "institution_type": obj.institution_type.name if obj.institution_type_id else None, + } + + def run_kb_resource_upload(model_label, resource_id): """Background thread: push a resource to the MCP KB and record its doc_id. @@ -169,15 +186,18 @@ def run_kb_resource_upload(model_label, resource_id): return try: + metadata = _build_resource_metadata(obj) if model_label == "pdf": obj.file.open("rb") try: file_bytes = obj.file.read() finally: obj.file.close() - result = add_pdf_to_kb(file_bytes, obj.file.name.split("/")[-1], obj.title) + result = add_pdf_to_kb( + file_bytes, obj.file.name.split("/")[-1], obj.title, metadata=metadata, + ) else: - result = add_website_to_kb(obj.url) + result = add_website_to_kb(obj.url, metadata=metadata) obj.mcp_kb_document_id = result.get("doc_id") obj.status = Resource.Status.SUCCESS diff --git a/hospexplorer/ask/templates/admin/ask/lookup_change_list.html b/hospexplorer/ask/templates/admin/ask/lookup_change_list.html new file mode 100644 index 0000000..766c505 --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/lookup_change_list.html @@ -0,0 +1,9 @@ +{% extends "admin/change_list.html" %} +{% load i18n %} + +{% block object-tools-items %} +
  • + {% trans "Import CSV" %} +
  • + {{ block.super }} +{% endblock %} diff --git a/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html b/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html new file mode 100644 index 0000000..63ae644 --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html @@ -0,0 +1,27 @@ +{% extends "admin/base_site.html" %} +{% load i18n %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +
    + {% csrf_token %} +

    + Upload a one-column CSV. The first column is treated as the + name. A leading header row of name is allowed + and will be skipped. Duplicate names are skipped silently. +

    +

    + +
    +{% endblock %} From 0eccab59d0df5b38c533f98df00003c271d84300 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Thu, 21 May 2026 16:49:15 -0700 Subject: [PATCH 3/4] [HOP-63] Added metadata to csv --- hospexplorer/ask/admin.py | 48 +++++- hospexplorer/ask/admin_csv.py | 29 ++++ .../admin/ask/pdfresource/upload_zip.html | 15 +- hospexplorer/ask/tests.py | 138 +++++++++++++++++- hospexplorer/hospexplorer/settings.py | 6 +- 5 files changed, 229 insertions(+), 7 deletions(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index bdbf4ed..3a1c4f9 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -27,7 +27,7 @@ DocumentAuthorInstitution, InstitutionType, ) -from ask.admin_csv import import_names_csv +from ask.admin_csv import import_names_csv, parse_partial_date from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -341,6 +341,50 @@ def save_model(self, request, obj, form, change): ) +# Optional metadata columns the zip-CSV importer reads onto each PDFResource. +# Controlled-list values create the matching lookup row the first time they +# appear, so the available options grow from what the imports actually use. +ZIP_CSV_LOOKUP_COLUMNS = { + "document_type": DocumentType, + "document_author_institution": DocumentAuthorInstitution, + "institution_type": InstitutionType, +} + + +def _apply_zip_csv_metadata(obj, row): + """Populate a resource's metadata fields from one zip-CSV row. + + Every metadata column is optional. Returns a list of human-readable + warnings for values that could not be applied — the row is still imported, + just with that field left blank. + """ + warnings = [] + + date_raw = (row.get("date_published") or "").strip() + if date_raw: + parsed_date, precision = parse_partial_date(date_raw) + if parsed_date: + obj.date_published = parsed_date + obj.date_published_precision = precision + else: + warnings.append( + f"invalid date_published '{date_raw}' " + "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank" + ) + + for column, model in ZIP_CSV_LOOKUP_COLUMNS.items(): + value = (row.get(column) or "").strip() + if not value: + continue + if len(value) > 255: + warnings.append(f"{column} value exceeds 255 characters; left blank") + continue + lookup, _ = model.objects.get_or_create(name=value) + setattr(obj, column, lookup) + + return warnings + + @admin.register(PDFResource) class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_display = ("title", "file", "creator", "status", "modified_at") @@ -498,6 +542,8 @@ def _is_real(name): status=PDFResource.Status.PROCESSING, status_message="Queued for Knowledge Base upload.", ) + for warning in _apply_zip_csv_metadata(obj, row): + messages.warning(request, f"Row {total}: {warning}") obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True) saved += 1 queued_ids.append(obj.pk) diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py index beebfd6..87eda10 100644 --- a/hospexplorer/ask/admin_csv.py +++ b/hospexplorer/ask/admin_csv.py @@ -1,5 +1,34 @@ import csv +import datetime import io +import re + +# A partial ISO date: a 4-digit year, optionally a month, optionally a day. +# Day can only appear when month does, so "year-day" is impossible to express. +_PARTIAL_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{1,2})(?:-(\d{1,2}))?)?$") + + +def parse_partial_date(value): + """Parse a partial ISO date string into a ``(date, precision)`` pair. + + Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``. A missing month/day + defaults to 1 so the value still fits a ``DateField``; ``precision`` + ("year", "month" or "day") records how much was actually supplied so the + padding can be ignored later. Blank or unparseable input returns + ``(None, "")``. + """ + match = _PARTIAL_DATE_RE.match((value or "").strip()) + if not match: + return None, "" + year, month, day = match.groups() + try: + if day is not None: + return datetime.date(int(year), int(month), int(day)), "day" + if month is not None: + return datetime.date(int(year), int(month), 1), "month" + return datetime.date(int(year), 1, 1), "year" + except ValueError: + return None, "" def import_names_csv(model, file_obj): diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 3b11f2c..587564b 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -12,9 +12,18 @@ {% block content %}

    {{ title }}

    - Upload a .zip containing PDF files and a single CSV metadata file - with columns {{ required_columns_label }}. Each row creates a PDF Resource - and pushes the file to the Knowledge Base. + Upload a .zip containing PDF files and a single CSV metadata file. + Each row creates a PDF Resource and pushes the file to the Knowledge Base. +

    +

    + Required CSV columns: {{ required_columns_label }}. +

    +

    + Optional metadata columns (per row, leave blank if unknown): + date_published — a year, YYYY-MM, or + YYYY-MM-DD; document_type, + document_author_institution, institution_type + — controlled-list values, created automatically as rows are imported.

    {% csrf_token %} diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index 749b619..589b8c0 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -1,12 +1,23 @@ +import datetime +import io import shutil import tempfile +import zipfile from unittest.mock import patch from django.contrib.auth.models import User from django.core.files.base import ContentFile from django.test import TestCase, override_settings +from django.urls import reverse -from ask.models import PDFResource +from ask.admin import _apply_zip_csv_metadata +from ask.admin_csv import parse_partial_date +from ask.models import ( + DocumentAuthorInstitution, + DocumentType, + InstitutionType, + PDFResource, +) class PDFResourceDeletionTests(TestCase): @@ -48,3 +59,128 @@ def test_successful_file_removal_is_not_flagged(self): pdf.file.save("report.pdf", ContentFile(b"%PDF-1.4 test"), save=True) pdf.delete() self.assertFalse(pdf.file_deletion_failed) + + +class ParsePartialDateTests(TestCase): + def test_full_date(self): + self.assertEqual( + parse_partial_date("2024-03-15"), (datetime.date(2024, 3, 15), "day") + ) + + def test_year_month(self): + self.assertEqual( + parse_partial_date("2024-03"), (datetime.date(2024, 3, 1), "month") + ) + + def test_year_only(self): + self.assertEqual( + parse_partial_date("2024"), (datetime.date(2024, 1, 1), "year") + ) + + def test_blank_or_none_returns_empty(self): + self.assertEqual(parse_partial_date(""), (None, "")) + self.assertEqual(parse_partial_date(" "), (None, "")) + self.assertEqual(parse_partial_date(None), (None, "")) + + def test_impossible_calendar_dates_rejected(self): + self.assertEqual(parse_partial_date("2024-13"), (None, "")) + self.assertEqual(parse_partial_date("2024-02-30"), (None, "")) + + def test_non_iso_input_rejected(self): + self.assertEqual(parse_partial_date("March 2024"), (None, "")) + self.assertEqual(parse_partial_date("24-03-15"), (None, "")) + + +class ApplyZipCsvMetadataTests(TestCase): + def test_creates_lookups_and_sets_fields(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, { + "date_published": "2023-06", + "document_type": "Report", + "document_author_institution": "WHO", + "institution_type": "NGO", + }) + self.assertEqual(warnings, []) + self.assertEqual(obj.date_published, datetime.date(2023, 6, 1)) + self.assertEqual(obj.date_published_precision, "month") + self.assertEqual(obj.document_type.name, "Report") + self.assertEqual(obj.document_author_institution.name, "WHO") + self.assertEqual(obj.institution_type.name, "NGO") + self.assertTrue(DocumentType.objects.filter(name="Report").exists()) + + def test_reuses_existing_lookup_row(self): + existing = DocumentType.objects.create(name="Report") + obj = PDFResource(title="Doc") + _apply_zip_csv_metadata(obj, {"document_type": "Report"}) + self.assertEqual(obj.document_type.pk, existing.pk) + self.assertEqual(DocumentType.objects.filter(name="Report").count(), 1) + + def test_blank_and_missing_columns_are_skipped(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, {"document_type": " ", "date_published": ""}) + self.assertEqual(warnings, []) + self.assertIsNone(obj.date_published) + self.assertIsNone(obj.document_type_id) + self.assertEqual(_apply_zip_csv_metadata(PDFResource(title="Doc"), {}), []) + + def test_invalid_date_warns_and_leaves_field_blank(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, {"date_published": "not-a-date"}) + self.assertEqual(len(warnings), 1) + self.assertIn("date_published", warnings[0]) + self.assertIsNone(obj.date_published) + + +@override_settings(PDF_ZIP_CSV_COLUMNS=("filename", "title")) +class ZipUploadViewTests(TestCase): + def setUp(self): + media_root = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, media_root, ignore_errors=True) + override = override_settings(MEDIA_ROOT=media_root) + override.enable() + self.addCleanup(override.disable) + self.admin = User.objects.create_superuser("admin", "admin@example.com", "pw") + self.client.force_login(self.admin) + + def _build_zip(self, csv_text, pdfs): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as archive: + archive.writestr("metadata.csv", csv_text) + for name, content in pdfs.items(): + archive.writestr(name, content) + buf.seek(0) + buf.name = "upload.zip" + return buf + + def test_zip_import_applies_csv_metadata(self): + csv_text = ( + "filename,title,date_published,document_type," + "document_author_institution,institution_type\r\n" + "report.pdf,Annual Report,2022,Report,WHO,NGO\r\n" + ) + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Annual Report") + self.assertEqual(pdf.date_published, datetime.date(2022, 1, 1)) + self.assertEqual(pdf.date_published_precision, "year") + self.assertEqual(pdf.document_type.name, "Report") + self.assertEqual(pdf.document_author_institution.name, "WHO") + self.assertEqual(pdf.institution_type.name, "NGO") + + def test_zip_import_works_without_metadata_columns(self): + csv_text = "filename,title\r\nreport.pdf,Plain Report\r\n" + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Plain Report") + self.assertIsNone(pdf.date_published) + self.assertIsNone(pdf.document_type_id) diff --git a/hospexplorer/hospexplorer/settings.py b/hospexplorer/hospexplorer/settings.py index 4764883..3b11ace 100644 --- a/hospexplorer/hospexplorer/settings.py +++ b/hospexplorer/hospexplorer/settings.py @@ -193,8 +193,10 @@ # the upload view requires both a filename column and a title column, so # PDFResourceAdmin will raise ImproperlyConfigured at request time. # -# Extra CSV columns beyond these two are ignored. Changing this does not change -# which PDFResource fields get populated, only title is read +# Only the filename and title columns are configurable here. The importer also +# reads optional, fixed-name metadata columns when present: date_published, +# document_type, document_author_institution, institution_type. Any other +# columns are ignored. PDF_ZIP_CSV_COLUMNS = tuple( column.strip() for column in os.getenv("PDF_ZIP_CSV_COLUMNS", "filename,title").split(",") if column.strip() From 020e6145c6b8d272bebe9fbcc61852d72ba981c6 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Fri, 22 May 2026 15:06:04 -0700 Subject: [PATCH 4/4] [HOP-63] Added metadata for zip --- hospexplorer/ask/admin.py | 7 ++++++- hospexplorer/ask/tests.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 3a1c4f9..3ec5a03 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -499,7 +499,12 @@ def _is_real(name): csv_text = archive.read(csv_names[0]).decode("utf-8-sig") reader = csv.DictReader(io.StringIO(csv_text)) - csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])} + # strip header names so the column check and per-row lookups use + # the same keys; otherwise a header like "filename, title" leaves + # stray spaces and every row reads as missing its required fields + if reader.fieldnames: + reader.fieldnames = [(name or "").strip() for name in reader.fieldnames] + csv_columns = set(reader.fieldnames or []) if not required_columns.issubset(csv_columns): missing = ", ".join(sorted(required_columns - csv_columns)) messages.error(request, f"CSV is missing required columns: {missing}.") diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index 589b8c0..f14ebe3 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -184,3 +184,20 @@ def test_zip_import_works_without_metadata_columns(self): pdf = PDFResource.objects.get(title="Plain Report") self.assertIsNone(pdf.date_published) self.assertIsNone(pdf.document_type_id) + + def test_zip_import_tolerates_whitespace_in_csv_header(self): + # spaces after commas in the header row must not cause rows to be skipped + csv_text = ( + "filename, title, date_published, document_type\r\n" + "report.pdf,Spaced Report,2021,Report\r\n" + ) + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Spaced Report") + self.assertEqual(pdf.date_published, datetime.date(2021, 1, 1)) + self.assertEqual(pdf.document_type.name, "Report")