From 8baa94fa16c899ed4ac67b40973d85d8b853ae48 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Mon, 26 Aug 2024 14:40:56 -0400 Subject: [PATCH 001/441] Fix fake flake8 issues --- config/settings/production.py | 4 ++-- config_generation/db_to_xml_file_based.py | 2 +- feedback/models.py | 2 +- scraper/url_grouper.py | 6 ++++-- scripts/ej/create_ej_dump.py | 3 ++- scripts/find_redirects_solar_urls.py | 4 ++-- .../quality_and_indexing/restore_deleted_files.py | 3 ++- sde_collections/admin.py | 2 +- sde_collections/models/collection.py | 12 +++++------- sde_collections/models/pattern.py | 2 +- sde_collections/utils/health_check.py | 4 ++-- sde_collections/utils/slack_utils.py | 4 ++-- sde_collections/utils/title_resolver.py | 2 +- 13 files changed, 26 insertions(+), 24 deletions(-) diff --git a/config/settings/production.py b/config/settings/production.py index aff7db28..270aa00c 100644 --- a/config/settings/production.py +++ b/config/settings/production.py @@ -70,11 +70,11 @@ # ------------------------ STATICFILES_STORAGE = "sde_indexing_helper.utils.storages.StaticRootS3Boto3Storage" COLLECTFAST_STRATEGY = "collectfast.strategies.boto3.Boto3Strategy" -STATIC_URL = f"https://{aws_s3_domain}/static/" +STATIC_URL = f"https://{aws_s3_domain}/static/" # noqa: E231 # MEDIA # ------------------------------------------------------------------------------ DEFAULT_FILE_STORAGE = "sde_indexing_helper.utils.storages.MediaRootS3Boto3Storage" -MEDIA_URL = f"https://{aws_s3_domain}/media/" +MEDIA_URL = f"https://{aws_s3_domain}/media/" # noqa: E231 # EMAIL # ------------------------------------------------------------------------------ diff --git a/config_generation/db_to_xml_file_based.py b/config_generation/db_to_xml_file_based.py index 88252366..14b077b7 100644 --- a/config_generation/db_to_xml_file_based.py +++ b/config_generation/db_to_xml_file_based.py @@ -98,7 +98,7 @@ def update_or_add_element_value( parent_element = xml_root if not parent_element_name else xml_root.find(parent_element_name) if parent_element is None: - raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") + raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") # noqa: E713 existing_element = parent_element.find(element_name) if not add_duplicate and existing_element: diff --git a/feedback/models.py b/feedback/models.py index 0666080f..de2921b5 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -33,7 +33,7 @@ def format_notification_message(self): Returns a formatted notification message containing details from this Feedback instance. """ notification_message = ( - f" New Feedback Received : \n" + f" New Feedback Received : \n" # noqa: E203 f"Name: {self.name}\n" f"Email: {self.email}\n" f"Subject: {self.subject}\n" diff --git a/scraper/url_grouper.py b/scraper/url_grouper.py index df3be3f9..db6188e5 100644 --- a/scraper/url_grouper.py +++ b/scraper/url_grouper.py @@ -42,10 +42,12 @@ output_file.write(f"
<a href='{BASE_URL}'>
{BASE_URL}</a>
</h1>
\n") output_file.write("\n") diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py index bab5baac..36d7f722 100644 --- a/scripts/ej/create_ej_dump.py +++ b/scripts/ej/create_ej_dump.py @@ -1,6 +1,7 @@ """ inferences are supplied by the classification model. the contact point is Bishwas -cmr is supplied by running https://github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py +cmr is supplied by running +github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/ """ diff --git a/scripts/find_redirects_solar_urls.py b/scripts/find_redirects_solar_urls.py index 3bdbc131..db78081b 100644 --- a/scripts/find_redirects_solar_urls.py +++ b/scripts/find_redirects_solar_urls.py @@ -43,9 +43,9 @@ def csv_to_dict_list(file_path): scraped_title = soup.find("title").text.strip() if soup.find("title") else "" except (AssertionError, Exception) as parse_error: scraped_title = "" - print(f"Error parsing URL {url_info['url']}: {parse_error}") + print(f"Error parsing URL {url_info['url']}: {parse_error}") # noqa: F821 except requests.RequestException as e: - print(f"Error fetching URL {url_info['url']}: {e}") + print(f"Error fetching URL {url_info['url']}: {e}") # noqa: F821 response_url = "" scraped_title = "" diff --git a/scripts/quality_and_indexing/restore_deleted_files.py b/scripts/quality_and_indexing/restore_deleted_files.py index 6d6fcb84..70721cc9 100644 --- a/scripts/quality_and_indexing/restore_deleted_files.py +++ b/scripts/quality_and_indexing/restore_deleted_files.py @@ -1,5 +1,6 @@ """ -you need to run this script in the root of the repository that from which the file was deleted, in this case the root of the sinequa_configs repository. +you need to run this script in the root of the repository that from which the file was deleted, +in this case the root of the sinequa_configs repository. 
""" import subprocess diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 1b38db21..cb105f80 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -154,7 +154,7 @@ def export_as_csv(self, request, queryset): field_names = [field.name for field in meta.fields] response = HttpResponse(content_type="text/csv") - response["Content-Disposition"] = f"attachment; filename={meta}.csv" + response["Content-Disposition"] = f"attachment; filename={meta}.csv" # noqa: E702 writer = csv.writer(response) writer.writerow(field_names) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index c5690a4b..31306b8c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -4,7 +4,6 @@ import requests from django.contrib.auth import get_user_model from django.db import models -from django.db.models import Q from django.db.models.signals import post_save from django.dispatch import receiver from model_utils import FieldTracker @@ -130,7 +129,7 @@ def tree_root(self) -> str: @property def server_url_secret_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "secret-prod", "scope": "All", @@ -144,7 +143,7 @@ def server_url_secret_prod(self) -> str: @property def server_url_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "query-smd-primary", "scope": "All", @@ -371,13 +370,12 @@ def candidate_urls_count(self) -> int: @property def sinequa_configuration(self) -> str: - return ( - f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" - ) + URL = f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" # noqa: E231, E501 + return URL @property def github_issue_link(self) -> str: - return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" + return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" # noqa: E231 @classmethod def _fetch_json_results(cls, url): diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index ae5d78ef..1e14042b 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -143,7 +143,7 @@ def validate_title_pattern(title_pattern_string): if element_type == "xpath": if not is_valid_xpath(element_value): - raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") + raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231 elif element_type == "brace": try: is_valid_fstring(element_value) diff --git a/sde_collections/utils/health_check.py b/sde_collections/utils/health_check.py index 19c45369..0e09bd87 100644 --- a/sde_collections/utils/health_check.py +++ b/sde_collections/utils/health_check.py @@ -127,12 +127,12 @@ def create_exclude_pattern_report(match_pattern, url): # check with http:// if match_pattern.find("http://") == -1: - url = f"http://{match_pattern}" + url = f"http://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) if match_pattern.find("https://") == -1: - url = f"https://{match_pattern}" + url = f"https://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: 
exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) else: diff --git a/sde_collections/utils/slack_utils.py b/sde_collections/utils/slack_utils.py index c4cfd78b..a8fae3ca 100644 --- a/sde_collections/utils/slack_utils.py +++ b/sde_collections/utils/slack_utils.py @@ -90,7 +90,7 @@ def format_slack_message(name, details, collection_id): message_template = details["message"] tags = " ".join([f"<{user}>" for user in details["tags"]]) - link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" + link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" # noqa: E231 linked_name = f"<{link}|{name}>" return tags + " " + message_template.format(name=linked_name) @@ -101,5 +101,5 @@ def send_slack_message(message): response = requests.post(webhook_url, json=payload) if response.status_code != 200: raise ValueError( - f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" + f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" # noqa: E231, E501 ) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index b9171de3..20211bf7 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -32,7 +32,7 @@ def is_valid_fstring(pattern: str) -> bool: if node.value.id not in context: variables_allowed = ", ".join([key for key in context.keys()]) raise ValueError( - f"Variable '{node.value.id}' not allowed in f-string pattern." + f"Variable '{node.value.id}' not allowed in f-string pattern." # noqa: E713 f" Allowed variables are: {variables_allowed}" ) From 9cc9acd0e957338f8e858885cea484684d75b873 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Mon, 26 Aug 2024 14:43:11 -0400 Subject: [PATCH 002/441] Move noqa to the right line --- scraper/url_grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/url_grouper.py b/scraper/url_grouper.py index db6188e5..01b12f8b 100644 --- a/scraper/url_grouper.py +++ b/scraper/url_grouper.py @@ -46,8 +46,8 @@ output_file.write("\n") output_file.write("\n") output_file.write("\n") From 2a22edab9bed637f81908cedddbc1639fe11851a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:58:03 +0000 Subject: [PATCH 003/441] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.6.0) - [github.com/asottile/pyupgrade: v3.3.1 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.3.1...v3.17.0) - [github.com/psf/black: 23.1.0 → 24.8.0](https://github.com/psf/black/compare/23.1.0...24.8.0) - [github.com/PyCQA/isort: 5.12.0 → 5.13.2](https://github.com/PyCQA/isort/compare/5.12.0...5.13.2) - [github.com/PyCQA/flake8: 6.0.0 → 7.1.1](https://github.com/PyCQA/flake8/compare/6.0.0...7.1.1) - [github.com/pre-commit/mirrors-mypy: v1.4.0 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.4.0...v1.11.2) --- .pre-commit-config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c4d553f..5631a71d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,37 +3,37 @@ default_stages: [commit] repos: - repo: 
https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.17.0 hooks: - id: pyupgrade args: [--py310-plus] - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: ["--config=setup.cfg"] additional_dependencies: [flake8-isort] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.0 + rev: v1.11.2 hooks: - id: mypy args: ["--strict"] From bccdaec949f4f6ed069010b1a3dfb7e798a57512 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:58:48 +0000 Subject: [PATCH 004/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config/wsgi.py | 1 + config_generation/delete_config_folders.py | 1 + config_generation/generate_collection_list.py | 1 + config_generation/generate_commands.py | 1 + document_classifier/async_scraper.py | 1 + document_classifier/encoder.py | 1 + document_classifier/load_dataset.py | 1 + sde_indexing_helper/users/tests/test_forms.py | 1 + 8 files changed, 8 insertions(+) diff --git a/config/wsgi.py b/config/wsgi.py index bc448e89..bbc3c1ef 100644 --- a/config/wsgi.py +++ b/config/wsgi.py @@ -13,6 +13,7 @@ framework. """ + import os import sys from pathlib import Path diff --git a/config_generation/delete_config_folders.py b/config_generation/delete_config_folders.py index 0fc138d6..119d48fc 100644 --- a/config_generation/delete_config_folders.py +++ b/config_generation/delete_config_folders.py @@ -5,6 +5,7 @@ - commands - jobs """ + import glob import os import shutil diff --git a/config_generation/generate_collection_list.py b/config_generation/generate_collection_list.py index 86556c53..ee0e9b47 100644 --- a/config_generation/generate_collection_list.py +++ b/config_generation/generate_collection_list.py @@ -4,6 +4,7 @@ - filter anything that isn't a webcrawler - provide a variable, turned_on_remaining_webcrawlers for import by other files """ + import os from db_to_xml import XmlEditor diff --git a/config_generation/generate_commands.py b/config_generation/generate_commands.py index a538ee03..1b41858c 100644 --- a/config_generation/generate_commands.py +++ b/config_generation/generate_commands.py @@ -2,6 +2,7 @@ sometimes spot fixes need to be run on a list of collections this file provides a quick framework to generate a batch of commands based on an input json """ + from db_to_xml_file_based import XmlEditor from generate_jobs import ParallelJobCreator diff --git a/document_classifier/async_scraper.py b/document_classifier/async_scraper.py index fb2fb7c7..12c039a3 100644 --- a/document_classifier/async_scraper.py +++ b/document_classifier/async_scraper.py @@ -1,4 +1,5 @@ """Asynchronously scrapes the HTML content of a given URL using a headless browser.""" + import asyncio import re diff --git a/document_classifier/encoder.py b/document_classifier/encoder.py index c62bfafc..1bacc5d9 100644 --- a/document_classifier/encoder.py +++ b/document_classifier/encoder.py @@ -1,4 +1,5 @@ """ Encoding the url response """ + import pandas as pd diff --git a/document_classifier/load_dataset.py b/document_classifier/load_dataset.py index 8c64e03b..d61efdad 100644 --- 
a/document_classifier/load_dataset.py +++ b/document_classifier/load_dataset.py @@ -1,4 +1,5 @@ """ Module for loading dataset """ + from torch.utils.data import DataLoader, SequentialSampler, TensorDataset diff --git a/sde_indexing_helper/users/tests/test_forms.py b/sde_indexing_helper/users/tests/test_forms.py index f89d7141..68145eaa 100644 --- a/sde_indexing_helper/users/tests/test_forms.py +++ b/sde_indexing_helper/users/tests/test_forms.py @@ -1,6 +1,7 @@ """ Module for all Form Tests. """ + from django.utils.translation import gettext_lazy as _ from sde_indexing_helper.users.forms import UserAdminCreationForm From abd565a01c1f2e3ddcfa0aa6a60595b4cda326b9 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 26 Aug 2024 21:15:21 -0500 Subject: [PATCH 005/441] Add LRM_QA_{USER, PASSWORD} variable to .django --- .envs/.local/.django | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.envs/.local/.django b/.envs/.local/.django index 97dfaab8..291b9a32 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -37,3 +37,5 @@ LRM_USER='' LRM_PASSWORD='' XLI_USER='' XLI_PASSWORD='' +LRM_QA_USER = '' +LRM_QA_PASSWORD = '' From d656506fb5fee5a11433141579b402f3c6ad06f1 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Wed, 28 Aug 2024 17:02:20 -0500 Subject: [PATCH 006/441] Fixes issue #989 - Make coding syntax consistent --- .envs/.local/.django | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index 291b9a32..402efc3c 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -37,5 +37,5 @@ LRM_USER='' LRM_PASSWORD='' XLI_USER='' XLI_PASSWORD='' -LRM_QA_USER = '' -LRM_QA_PASSWORD = '' +LRM_QA_USER='' +LRM_QA_PASSWORD='' From 71c8fb4485fbbf9c9a7a767c7d15c491561eeff2 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 15:02:17 -0500 Subject: [PATCH 007/441] Fixes issue #993 - Add SQLDumpRestoration.md file --- SQLDumpRestoration.md | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 SQLDumpRestoration.md diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md new file mode 100644 index 00000000..3a672a1f --- /dev/null +++ b/SQLDumpRestoration.md @@ -0,0 +1,81 @@ +## Restoring the Database from SQL Dump + +We generally load a database backup from a JSON file by using the following command. + +``` +docker-compose -f local.yml run --rm django python manage.py loaddata backup.json +``` + +However, if the JSON file is particularly large (>1.5GB), Docker might struggle with this method. In such cases, you can use SQL dump and restore commands as an alternative. + +### Steps for Using SQL Dump and Restore + +1. Begin by starting only the PostgreSQL container. This prevents the Django container from making changes while the PostgreSQL container is starting up. + +``` +docker-compose -f local.yml up postgres +``` + +2. Find the container ID using `docker ps`, then enter the PostgreSQL container to execute commands. + +``` +$ docker ps +CONTAINER ID IMAGE COMMAND +23d33f22cc43 sde_indexing_helper_production_postgres "docker-entrypoint.s…" + +$ docker exec -it 23d33f22cc43 bash +``` + +3. Create a connection to the database. + +``` +psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper +``` + +4. Ensure that the database `sde_indexing_helper` is empty. + +``` +sde_indexing_helper-# \c +You are now connected to database "sde_indexing_helper" as user "VnUvMKBSdk...". +sde_indexing_helper-# \dt +Did not find any relations. 
+``` + +If the database is not empty, delete its contents to create a fresh database: + +``` +sde_indexing_helper=# \c postgres +You are now connected to database "postgres" as user "VnUvMKBSdkoFIETgLongnxYHrYVJKufn". +postgres=# DROP DATABASE sde_indexing_helper; +DROP DATABASE +postgres=# CREATE DATABASE sde_indexing_helper; +CREATE DATABASE + +``` + +5. Transfer the backup SQL dump (`backup.sql`) from your local machine to the PostgreSQL container. + +``` +docker cp /local/path/backup.sql 23d33f22cc43:/ +``` + +6. Import the SQL dump into the PostgreSQL container. + +``` +psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper -f backup.sql +``` + +**Note**: To create a SQL dump of your PostgreSQL database, use the following command: + +``` +pg_dump -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -W -F p -f backup.sql sde_indexing_helper +``` + +7. Bring up all containers at once, and create a superuser account for logging in. + +``` +docker-compose -f local.yml up +docker-compose -f local.yml run --rm django python manage.py createsuperuser +``` + +8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. \ No newline at end of file From 52521e3076eab9e6a3b265a142238a9b4f281f89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:04:58 +0000 Subject: [PATCH 008/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- SQLDumpRestoration.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md index 3a672a1f..49e78994 100644 --- a/SQLDumpRestoration.md +++ b/SQLDumpRestoration.md @@ -19,11 +19,11 @@ docker-compose -f local.yml up postgres 2. Find the container ID using `docker ps`, then enter the PostgreSQL container to execute commands. ``` -$ docker ps -CONTAINER ID IMAGE COMMAND +$ docker ps +CONTAINER ID IMAGE COMMAND 23d33f22cc43 sde_indexing_helper_production_postgres "docker-entrypoint.s…" -$ docker exec -it 23d33f22cc43 bash +$ docker exec -it 23d33f22cc43 bash ``` 3. Create a connection to the database. @@ -32,7 +32,7 @@ $ docker exec -it 23d33f22cc43 bash psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper ``` -4. Ensure that the database `sde_indexing_helper` is empty. +4. Ensure that the database `sde_indexing_helper` is empty. ``` sde_indexing_helper-# \c @@ -78,4 +78,4 @@ docker-compose -f local.yml up docker-compose -f local.yml run --rm django python manage.py createsuperuser ``` -8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. \ No newline at end of file +8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. From 2e147f9e2d24d1b2ae20afcba3afc68ec5cd1d57 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 16:53:13 -0500 Subject: [PATCH 009/441] Fixes issue #995 --- CONTRIBUTING.md | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..db454288 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +Thank you for your interest in contributing to COSMOS! We welcome contributions and appreciate your help in making this project better. Please follow the guidelines below to ensure a smooth contribution process. 
+ +## Pull Requests + +### Prerequisites + +- **GitHub CLI (`gh`)**: Make sure you have the GitHub CLI installed. If not, you can install it from [GitHub CLI installation page](https://cli.github.com/). + +### 1. **Create an Issue on the Repo** + +1. **Navigate to Your Repository**: + + ```bash + $ cd path/to/your/repository + ``` + +2. **Create an Issue**: +Use the `gh issue create` command to create a new issue. + + ```bash + $ gh issue create --title "Issue Title" --body "Description of the issue" + ``` + + After running this command, you’ll get an issue number in the output. Note this number as it will be used to create a branch. + + +### 2. **Create a Branch for the Issue** + +1. **Create a Branch**: +Use the `gh` CLI to create a branch associated with the issue. The `gh` CLI can automatically create a branch for you based on the issue number. In this case, the `` is 989. + + ```bash + $ gh issue develop -c 989 + github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent + From https://github.com/NASA-IMPACT/COSMOS + * [new branch] 989-make-coding-syntax-consistent -> origin/989-make-coding-syntax-consistent + + ``` + + This command creates a new branch named `-issue` and switches to it. This branch will be used to work on the issue. + +2. **Make Your Changes and Push:** +Edit files, add code, or make any changes needed to address the issue. Commit your changes and push the branch to the remote repository. + + ```bash + git add . + git commit -m "Fixes issue #" + git push origin -issue + ``` + + +### 3. **Create a Pull Request** + +1. **Create the Pull Request**: +After pushing the branch, create a pull request using the `gh pr create` command: + + ```bash + gh pr create --base dev --head -issue --title "Title of the Pull Request" --body "Description of the changes" + ``` + + - **`-base`**: The base branch you want to merge your changes into (`dev` in our case) + - **`-head`**: The branch that contains your changes (e.g., `-issue`). + - **`-title`**: The title of the pull request. + - **`-body`**: The description or body of the pull request. + + This command will create a pull request from your branch into the base branch specified. + +2. **Review and Merge**: +Once the pull request is created, you can review it on GitHub and merge it if everything looks good. \ No newline at end of file From 733c6c55d40c8e144aeea3b1e91462f857416d82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:57:55 +0000 Subject: [PATCH 010/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CONTRIBUTING.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index db454288..7564194d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,61 +9,61 @@ Thank you for your interest in contributing to COSMOS! We welcome contributions ### 1. **Create an Issue on the Repo** 1. **Navigate to Your Repository**: - + ```bash $ cd path/to/your/repository ``` - + 2. **Create an Issue**: Use the `gh issue create` command to create a new issue. - + ```bash $ gh issue create --title "Issue Title" --body "Description of the issue" ``` - + After running this command, you’ll get an issue number in the output. Note this number as it will be used to create a branch. - + ### 2. **Create a Branch for the Issue** 1. **Create a Branch**: Use the `gh` CLI to create a branch associated with the issue. 
The `gh` CLI can automatically create a branch for you based on the issue number. In this case, the `` is 989. - + ```bash $ gh issue develop -c 989 - github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent - From https://github.com/NASA-IMPACT/COSMOS + github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent + From https://github.com/NASA-IMPACT/COSMOS * [new branch] 989-make-coding-syntax-consistent -> origin/989-make-coding-syntax-consistent - + ``` - + This command creates a new branch named `-issue` and switches to it. This branch will be used to work on the issue. - + 2. **Make Your Changes and Push:** Edit files, add code, or make any changes needed to address the issue. Commit your changes and push the branch to the remote repository. - + ```bash git add . git commit -m "Fixes issue #" git push origin -issue ``` - + ### 3. **Create a Pull Request** 1. **Create the Pull Request**: After pushing the branch, create a pull request using the `gh pr create` command: - + ```bash gh pr create --base dev --head -issue --title "Title of the Pull Request" --body "Description of the changes" ``` - + - **`-base`**: The base branch you want to merge your changes into (`dev` in our case) - **`-head`**: The branch that contains your changes (e.g., `-issue`). - **`-title`**: The title of the pull request. - **`-body`**: The description or body of the pull request. - + This command will create a pull request from your branch into the base branch specified. - + 2. **Review and Merge**: -Once the pull request is created, you can review it on GitHub and merge it if everything looks good. \ No newline at end of file +Once the pull request is created, you can review it on GitHub and merge it if everything looks good. From 3cfa766d2c531a131d62cb889820261f0542a987 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 20:18:05 -0500 Subject: [PATCH 011/441] Add more description to 'Review and Merge' --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7564194d..b3d25f62 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,4 +66,4 @@ After pushing the branch, create a pull request using the `gh pr create` command This command will create a pull request from your branch into the base branch specified. 2. **Review and Merge**: -Once the pull request is created, you can review it on GitHub and merge it if everything looks good. +Once the pull request is created, we will review it on GitHub and merge it if everything looks good. If any changes are required, we might ask you to make adjustments before the merge. From a7f823a03996d7b119aeb07beca6725c4005851d Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 21:34:39 -0500 Subject: [PATCH 012/441] Improve SQLDumpRestoration.md and README.md files --- README.md | 7 +++++-- SQLDumpRestoration.md | 16 ++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 61cf6b50..cc64ef01 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,10 @@ $ docker cp /path/to/your/backup.json container_name:/path/inside/container/back $ docker-compose -f local.yml run --rm django python manage.py loaddata /path/inside/the/container/backup.json $ docker-compose -f local.yml run --rm django python manage.py migrate ``` +### Restoring the Database from a SQL Dump +If the JSON file is particularly large (>1.5GB), Docker might struggle with this method. 
In such cases, you can use SQL dump and restore commands as an alternative, as described [here](./SQLDumpRestoration.md). + + ## Additional Commands @@ -191,8 +195,7 @@ Documented [here](https://github.com/NASA-IMPACT/sde-indexing-helper/wiki/How-to ## Adding New Features/Fixes -1. Start with a [GitHub issue](https://github.com/NASA-IMPACT/sde-indexing-helper/issues). -2. Use the GitHub CLI to create branches and pull requests (`gh issue develop -c `). +We welcome contributions to improve the project! Before you begin, please take a moment to review our [Contributing Guidelines](./CONTRIBUTING.md). These guidelines will help you understand the process for submitting new features, bug fixes, and other improvements. ## Job Creation diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md index 49e78994..6b4792be 100644 --- a/SQLDumpRestoration.md +++ b/SQLDumpRestoration.md @@ -29,10 +29,14 @@ $ docker exec -it 23d33f22cc43 bash 3. Create a connection to the database. ``` -psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper +psql -U -d ``` -4. Ensure that the database `sde_indexing_helper` is empty. +**Note**: +- For local deployment, refer to the `.envs/.local/.postgres` file for the `POSTGRES_USER` and `POSTGRES_DB` variables. +- For production deployment, refer to the `.envs/.production/.postgres` file. + +4. Ensure that the database `` is empty. Here's an example: ``` sde_indexing_helper-# \c @@ -44,8 +48,8 @@ Did not find any relations. If the database is not empty, delete its contents to create a fresh database: ``` -sde_indexing_helper=# \c postgres -You are now connected to database "postgres" as user "VnUvMKBSdkoFIETgLongnxYHrYVJKufn". +sde_indexing_helper=# \c postgres //connect to a different database before dropping +You are now connected to database "postgres" as user "VnUvMKBSdk....". postgres=# DROP DATABASE sde_indexing_helper; DROP DATABASE postgres=# CREATE DATABASE sde_indexing_helper; @@ -62,13 +66,13 @@ docker cp /local/path/backup.sql 23d33f22cc43:/ 6. Import the SQL dump into the PostgreSQL container. ``` -psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper -f backup.sql +psql -U -d -f backup.sql ``` **Note**: To create a SQL dump of your PostgreSQL database, use the following command: ``` -pg_dump -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -W -F p -f backup.sql sde_indexing_helper +pg_dump -U -W -F p -f backup.sql ``` 7. Bring up all containers at once, and create a superuser account for logging in. 
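PATCH 012 documents the dump/restore flow as manual `docker`/`psql` steps. For repeated restores the same flow can be scripted; below is a minimal sketch, not part of any patch in this series. The container name, user, and database are illustrative placeholders — in practice they come from `docker ps` and the `POSTGRES_USER`/`POSTGRES_DB` values in `.envs/.local/.postgres`.

```python
"""Hedged sketch: automate steps 5-6 of SQLDumpRestoration.md."""
import subprocess

CONTAINER = "sde_indexing_helper_postgres"  # placeholder; find the real name via `docker ps`
DB_USER = "postgres"                        # placeholder; use POSTGRES_USER from .envs/.local/.postgres
DB_NAME = "sde_indexing_helper"             # placeholder; use POSTGRES_DB from .envs/.local/.postgres


def restore_sql_dump(dump_path: str) -> None:
    """Copy a SQL dump into the running postgres container and replay it."""
    # Step 5: copy the dump file into the container's filesystem
    subprocess.run(["docker", "cp", dump_path, f"{CONTAINER}:/backup.sql"], check=True)
    # Step 6: import the dump with psql inside the container
    subprocess.run(
        ["docker", "exec", CONTAINER, "psql", "-U", DB_USER, "-d", DB_NAME, "-f", "/backup.sql"],
        check=True,
    )


if __name__ == "__main__":
    restore_sql_dump("backup.sql")
```

As with the manual steps, this assumes only the postgres container is up (`docker-compose -f local.yml up postgres`) and that the target database already exists and is empty.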
From 40574b9b8ad9e86295e6b6fdc1aea705744a63c3 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 9 Sep 2024 09:20:18 -0500 Subject: [PATCH 013/441] remove force reindexing from templates --- config_generation/xmls/job_template.xml | 4 ++-- config_generation/xmls/plugin_indexing_template.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config_generation/xmls/job_template.xml b/config_generation/xmls/job_template.xml index c5406ea9..9832101e 100644 --- a/config_generation/xmls/job_template.xml +++ b/config_generation/xmls/job_template.xml @@ -3,7 +3,7 @@ collection - _ForceReindexation + false @@ -32,4 +32,4 @@ - + \ No newline at end of file diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index b7a9ce63..f7978062 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -8,7 +8,7 @@ 1 - true + false SMD_Plugins/Sinequa.Plugin.WebCrawler_Index_URLList 6 @@ -272,4 +272,4 @@ version Md5(doc.url1) - + \ No newline at end of file From a4fb1586af159a9455cdb0126043aae479e7e1b3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:23:31 +0000 Subject: [PATCH 014/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config_generation/xmls/job_template.xml | 2 +- config_generation/xmls/plugin_indexing_template.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config_generation/xmls/job_template.xml b/config_generation/xmls/job_template.xml index 9832101e..7763ecf1 100644 --- a/config_generation/xmls/job_template.xml +++ b/config_generation/xmls/job_template.xml @@ -32,4 +32,4 @@ - \ No newline at end of file + diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index f7978062..44bfba6c 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -272,4 +272,4 @@ version Md5(doc.url1) - \ No newline at end of file + From 0bb97e3c01dd34142c32aa5c2d623da113fa207a Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 16 Sep 2024 16:12:59 -0500 Subject: [PATCH 015/441] point tree root to name --- sde_collections/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 4540bdfb..9623e85d 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -141,9 +141,9 @@ def get_file_extension(self, obj): def get_tree_root(self, obj): if obj.collection.is_multi_division: if obj.division: - return f"/{obj.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.get_division_display()}/{obj.collection.name}/" else: - return f"/{obj.collection.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.collection.get_division_display()}/{obj.collection.name}/" else: return obj.collection.tree_root From 233730b19c3a7984e56160f58fb3f35d5da74123 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 23 Sep 2024 13:35:20 -0500 Subject: [PATCH 016/441] change LRM dev configurations --- sde_collections/sinequa_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 1dffe26b..1c0a663f 100644 --- a/sde_collections/sinequa_api.py 
+++ b/sde_collections/sinequa_api.py @@ -38,13 +38,13 @@ "base_url": "http://sde-xli.nasa-impact.net", }, "lrm_dev_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-lrm.nasa-impact.net", }, "lrm_qa_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-qa.nasa-impact.net", }, } From 9408e45fc8e565efeeac25d2f1e43abf90f27526 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 25 Sep 2024 13:44:13 -0500 Subject: [PATCH 017/441] get URLs from scrapers folder for LRM servers --- sde_collections/sinequa_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 1c0a663f..0e4c3b62 100644 --- a/sde_collections/sinequa_api.py +++ b/sde_collections/sinequa_api.py @@ -94,7 +94,7 @@ def query(self, page: int, collection_config_folder: str = "") -> Any: } if collection_config_folder: - if self.server_name == "lis_server": + if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]: payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/" else: payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/" From 9c7b25bc0f43234fa9ac097f2a53a9ef02aae131 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:04:46 -0500 Subject: [PATCH 018/441] adding the new base URL model --- sde_collections/models/url.py | 85 +++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 sde_collections/models/url.py diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py new file mode 100644 index 00000000..7ce86dff --- /dev/null +++ b/sde_collections/models/url.py @@ -0,0 +1,85 @@ +import os +from urllib.parse import urlparse + +from django.db import models + +from .collection import Collection +from .collection_choice_fields import Divisions, DocumentTypes +from .pattern import ExcludePattern + + +class UrlQuerySet(models.QuerySet): + def with_exclusion_status(self): + return self.annotate( + excluded=models.Exists( + ExcludePattern.candidate_urls.through.objects.filter(candidateurl=models.OuterRef("pk")) + ) + ) + + +class UrlManager(models.Manager): + def get_queryset(self): + return UrlQuerySet(self.model, using=self._db).with_exclusion_status() + + +class Url(models.Model): + """This is the base URL model which serves as a base for DeltaUrl and CuratedUrl.""" + + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="urls") + url = models.CharField("URL", max_length=4096) + scraped_title = models.CharField( + "Scraped Title", + max_length=1024, + default="", + blank=True, + help_text="This is the original title scraped by Sinequa", + ) + generated_title = models.CharField( + "Generated Title", + max_length=1024, + default="", + blank=True, + help_text="This is the title generated based on a Title Pattern", + ) + visited = models.BooleanField(default=False) + document_type = models.IntegerField(choices=DocumentTypes.choices, null=True) + division = models.IntegerField(choices=Divisions.choices, null=True) + + objects = UrlManager() + + class Meta: + verbose_name = "URL" + verbose_name_plural = "URLs" + ordering = ["url"] + + @property + def fileext(self) -> str: + parsed_url = urlparse(self.url) + path = parsed_url.path + if path.endswith("/") 
or not path: + return "html" + extension = os.path.splitext(path)[1] + return extension[1:] if extension.startswith(".") else extension or "html" + + def splits(self) -> list[tuple[str, str]]: + parts = [] + part_string = "" + for part in self.path.split("/"): + if part: + part_string += f"/{part}" + parts.append((part_string, part)) + return parts + + @property + def path(self) -> str: + parsed = urlparse(self.url) + path = f"{parsed.path}" + if parsed.query: + path += f"?{parsed.query}" + return path + + def __str__(self) -> str: + return self.url + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) From 115481d5359ff7064061ea18ff0e02152343fbd7 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:08 -0500 Subject: [PATCH 019/441] adding the new dump url model --- sde_collections/models/dump_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py new file mode 100644 index 00000000..85ef85d9 --- /dev/null +++ b/sde_collections/models/dump_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class DumpUrl(Url): + """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" From 8af6102de71cb45d5e32f0c61dedf011583df1d0 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:28 -0500 Subject: [PATCH 020/441] adding the new delta url model --- sde_collections/models/delta_url.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 sde_collections/models/delta_url.py diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py new file mode 100644 index 00000000..028607ab --- /dev/null +++ b/sde_collections/models/delta_url.py @@ -0,0 +1,13 @@ +from django.db import models + +from .url import Url + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + verbose_name_plural = "Delta URLs" From 3f9c88520939f53a61711f6c2dc6f0ec351c6918 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:50 -0500 Subject: [PATCH 021/441] adding the new curated url model --- sde_collections/models/curated_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/curated_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py new file mode 100644 index 00000000..d55dcb5f --- /dev/null +++ b/sde_collections/models/curated_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 3c9627fc3e67d477f2746d63a8304695b334ed5e Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:06:08 -0500 Subject: [PATCH 022/441] adding the necessary migration file --- .../0059_url_curatedurl_deltaurl_dumpurl.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py new file mode 100644 index 00000000..82f4d4af --- /dev/null +++ 
b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -0,0 +1,146 @@ +# Generated by Django 4.2.9 on 2024-10-10 03:01 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="Url", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(max_length=4096, verbose_name="URL")), + ( + "scraped_title", + models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + max_length=1024, + verbose_name="Scraped Title", + ), + ), + ( + "generated_title", + models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + max_length=1024, + verbose_name="Generated Title", + ), + ), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "verbose_name": "URL", + "verbose_name_plural": "URLs", + "ordering": ["url"], + }, + ), + migrations.CreateModel( + name="CuratedUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Curated URL", + "verbose_name_plural": "Curated URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DeltaUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ("delete", models.BooleanField(default=False)), + ], + options={ + "verbose_name": "Delta URL", + "verbose_name_plural": "Delta URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 2fcd346a2260779f64f319f7c63436792ca13cc1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:08:41 -0500 Subject: [PATCH 023/441] adding a command file to migrate urls into delta and curated URL models --- .../management/commands/migrate_urls.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 sde_collections/management/commands/migrate_urls.py diff --git a/sde_collections/management/commands/migrate_urls.py b/sde_collections/management/commands/migrate_urls.py new file mode 100644 index 
00000000..6958c107 --- /dev/null +++ b/sde_collections/management/commands/migrate_urls.py @@ -0,0 +1,59 @@ +from django.core.management.base import BaseCommand + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices +from sde_collections.models.curated_url import CuratedUrl +from sde_collections.models.delta_url import DeltaUrl + + +class Command(BaseCommand): + help = "Migrate CandidateURLs to CuratedUrl or DeltaUrl based on collection workflow status" + + def handle(self, *args, **kwargs): + # Migrate CandidateURLs for collections with CURATED or higher workflow status to CuratedUrl + collections_for_curated = Collection.objects.filter(workflow_status__gte=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_curated.count()} collections with CURATED or higher status..." + ) + + for collection in collections_for_curated: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + CuratedUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to CuratedUrl." + ) + + # Migrate CandidateURLs for collections with a status lower than CURATED to DeltaUrl + collections_for_delta = Collection.objects.filter(workflow_status__lt=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_delta.count()} collections with status lower than CURATED..." + ) + + for collection in collections_for_delta: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + DeltaUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + delete=False, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to DeltaUrl." 
+ ) + + self.stdout.write(self.style.SUCCESS("Migration complete.")) From d691af30fa10362f35a61b6bfd9f175ba3175bac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:09:37 -0500 Subject: [PATCH 024/441] added the new models into admin console --- sde_collections/admin.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cb105f80..e4ff5097 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,7 +5,11 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -299,9 +303,41 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class CuratedUrlAdmin(admin.ModelAdmin): + """Admin View for CuratedUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DeltaUrlAdmin(admin.ModelAdmin): + """Admin View for DeltaUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) +admin.site.register(Url, UrlAdmin) +admin.site.register(DeltaUrl, DeltaUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(CuratedUrl, CuratedUrlAdmin) From a17029f88dc2644e3705ed11aa9ce9a4e727c431 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:01:11 -0500 Subject: [PATCH 025/441] removed url and dumpurl models from admin --- sde_collections/admin.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index e4ff5097..4fce1ea7 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,9 +7,7 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -303,20 +301,6 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") -class UrlAdmin(admin.ModelAdmin): - """Admin View for Url""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - -class DumpUrlAdmin(admin.ModelAdmin): - """Admin View for DumpUrl""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - class CuratedUrlAdmin(admin.ModelAdmin): """Admin View for CuratedUrl""" @@ -337,7 +321,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): 
admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) -admin.site.register(Url, UrlAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) -admin.site.register(DumpUrl, DumpUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) From 8606581c8e7970403519499e7171ae8503f7c296 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:10 -0500 Subject: [PATCH 026/441] edited the curated url api serialzier used for indexing --- sde_collections/serializers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..2f11700b 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,6 +3,7 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -107,19 +108,18 @@ class Meta: ) -class CandidateURLAPISerializer(serializers.ModelSerializer): +class CuratedUrlAPISerializer(serializers.ModelSerializer): document_type = serializers.SerializerMethodField() title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() class Meta: - model = CandidateURL + model = CuratedUrl fields = ( "url", "title", "document_type", - "hash", "file_extension", "tree_root", ) From 0f8578cb2059bfce3c9f0508663090fd7e6c08ff Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:35 -0500 Subject: [PATCH 027/441] changed the api endpoit to have an appropriate name --- sde_collections/urls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 4e3d0534..214d1198 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -55,9 +55,9 @@ # Delete an existing CandidateURL instance: /candidate-urls/{id}/ path("api/", include(router.urls)), path( - "candidate-urls-api//", - view=views.CandidateURLAPIView.as_view(), - name="candidate-url-api", + "curated-urls-api//", + view=views.CuratedURLAPIView.as_view(), + name="curated-url-api", ), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), ] From 717eb533f59878f776b45b43216464323127341f Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:03:22 -0500 Subject: [PATCH 028/441] changed the api vew to point to the right curated url model --- sde_collections/views.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index 241979ba..b8ff70a0 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,6 +27,7 @@ DocumentTypes, WorkflowStatusChoices, ) +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,11 +36,11 @@ TitlePattern, ) from .serializers import ( - CandidateURLAPISerializer, CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, + CuratedUrlAPISerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -307,8 +308,8 @@ def create(self, request, *args, **kwargs): return Response(serializer.data, 
status=status.HTTP_201_CREATED)


-class CandidateURLAPIView(ListAPIView):
-    serializer_class = CandidateURLAPISerializer
+class CuratedURLAPIView(ListAPIView):
+    serializer_class = CuratedUrlAPISerializer

     def get(self, request, *args, **kwargs):
         config_folder = kwargs.get("config_folder")
@@ -317,7 +318,7 @@ def get(self, request, *args, **kwargs):

     def get_queryset(self):
         queryset = (
-            CandidateURL.objects.filter(collection__config_folder=self.config_folder)
+            CuratedUrl.objects.filter(collection__config_folder=self.config_folder)
             .with_exclusion_status()
             .filter(excluded=False)
         )

From 83cb35a45d39dba10fcc22e0d7b6ae7979cc299b Mon Sep 17 00:00:00 2001
From: Bishwas Praveen
Date: Mon, 14 Oct 2024 12:03:36 -0500
Subject: [PATCH 029/441] migration file with changes

---
 .../migrations/0060_delete_dumpurl.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 sde_collections/migrations/0060_delete_dumpurl.py

diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py
new file mode 100644
index 00000000..db9a10c1
--- /dev/null
+++ b/sde_collections/migrations/0060_delete_dumpurl.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.9 on 2024-10-14 16:37
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name="DumpUrl",
+        ),
+    ]

From 27d0b49bff19ce81905286f3b3cb2925132dcca0 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Mon, 14 Oct 2024 20:40:01 -0500
Subject: [PATCH 030/441] change EnableNeuralIndexing to true in indexing template

---
 config_generation/xmls/plugin_indexing_template.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml
index 44bfba6c..34aea51f 100644
--- a/config_generation/xmls/plugin_indexing_template.xml
+++ b/config_generation/xmls/plugin_indexing_template.xml
@@ -20,7 +20,7 @@

-      <EnableNeuralIndexing>false</EnableNeuralIndexing>
+      <EnableNeuralIndexing>true</EnableNeuralIndexing>
       true
       false

From d537302dcbbc288175ec81f62994b5fec84fbcbc Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Thu, 17 Oct 2024 13:52:37 -0500
Subject: [PATCH 031/441] add per indicator thresholding and new dump

---
 scripts/ej/cmr_to_models.py  |  2 +-
 scripts/ej/create_ej_dump.py | 37 +++++++++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py
index 130de722..f7ba46db 100644
--- a/scripts/ej/cmr_to_models.py
+++ b/scripts/ej/cmr_to_models.py
@@ -69,7 +69,7 @@ def categorize_processing_level(level):

 # remove existing data
 EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete()
-ej_dump = json.load(open("backups/ej_dump_20240815_112916.json"))
+ej_dump = json.load(open("backups/ej_dump_20241017_133151.json"))
 for dataset in ej_dump:
     ej_row = EnvironmentalJusticeRow(
         destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV,

diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py
index 36d7f722..c44aebc5 100644
--- a/scripts/ej/create_ej_dump.py
+++ b/scripts/ej/create_ej_dump.py
@@ -2,7 +2,7 @@
 inferences are supplied by the classification model.

From 83cb35a45d39dba10fcc22e0d7b6ae7979cc299b Mon Sep 17 00:00:00 2001
From: Bishwas Praveen
Date: Mon, 14 Oct 2024 12:03:36 -0500
Subject: [PATCH 029/441] migration file with changes

---
 .../migrations/0060_delete_dumpurl.py         | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 sde_collections/migrations/0060_delete_dumpurl.py

diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py
new file mode 100644
index 00000000..db9a10c1
--- /dev/null
+++ b/sde_collections/migrations/0060_delete_dumpurl.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.9 on 2024-10-14 16:37
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name="DumpUrl",
+        ),
+    ]

From 27d0b49bff19ce81905286f3b3cb2925132dcca0 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Mon, 14 Oct 2024 20:40:01 -0500
Subject: [PATCH 030/441] change EnableNeuralIndexing to true in indexing
 template

---
 config_generation/xmls/plugin_indexing_template.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml
index 44bfba6c..34aea51f 100644
--- a/config_generation/xmls/plugin_indexing_template.xml
+++ b/config_generation/xmls/plugin_indexing_template.xml
@@ -20,7 +20,7 @@

-    <EnableNeuralIndexing>false</EnableNeuralIndexing>
+    <EnableNeuralIndexing>true</EnableNeuralIndexing>
     true
     false

From d537302dcbbc288175ec81f62994b5fec84fbcbc Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Thu, 17 Oct 2024 13:52:37 -0500
Subject: [PATCH 031/441] add per-indicator thresholding and new dump

---
 scripts/ej/cmr_to_models.py  |  2 +-
 scripts/ej/create_ej_dump.py | 37 +++++++++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py
index 130de722..f7ba46db 100644
--- a/scripts/ej/cmr_to_models.py
+++ b/scripts/ej/cmr_to_models.py
@@ -69,7 +69,7 @@ def categorize_processing_level(level):

 # remove existing data
 EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete()
-ej_dump = json.load(open("backups/ej_dump_20240815_112916.json"))
+ej_dump = json.load(open("backups/ej_dump_20241017_133151.json"))
 for dataset in ej_dump:
     ej_row = EnvironmentalJusticeRow(
         destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV,

diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py
index 36d7f722..c44aebc5 100644
--- a/scripts/ej/create_ej_dump.py
+++ b/scripts/ej/create_ej_dump.py
@@ -2,7 +2,7 @@
 inferences are supplied by the classification model.
 the contact point is Bishwas
 cmr is supplied by running
 github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py
-move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/
+move to the server like this: scp ej_dump_20241017_133151.json sde:/home/ec2-user/sde_indexing_helper/backups/
 """

 import json
@@ -19,12 +19,12 @@ def save_to_json(data: dict | list, file_path: str) -> None:
         json.dump(data, file, indent=2)


-def process_classifications(predictions: list[dict[str, float]], threshold: float = 0.5) -> list[str]:
+def process_classifications(predictions: list[dict[str, float]], thresholds: dict[str, float]) -> list[str]:
     """
-    Process the predictions and classify as follows:
-    1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification
-    2. Filter classifications based on the threshold, excluding 'Not EJ'
-    3. Default to 'Not EJ' if no classifications meet the threshold
+    Process the predictions and classify based on the individual thresholds per indicator:
+    1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification.
+    2. Filter classifications based on their individual thresholds, excluding 'Not EJ'.
+    3. Default to 'Not EJ' if no classifications meet the threshold.
     """
     highest_prediction = max(predictions, key=lambda x: x["score"])

@@ -32,7 +32,9 @@ def process_classifications(predictions: list[dict[str, float]], thresholds: dict[str, float]) -> list[str]:
         return ["Not EJ"]

     classifications = [
-        pred["label"] for pred in predictions if pred["score"] >= threshold and pred["label"] != "Not EJ"
+        pred["label"]
+        for pred in predictions
+        if pred["score"] >= thresholds[pred["label"]] and pred["label"] != "Not EJ"
     ]

     return classifications if classifications else ["Not EJ"]
@@ -63,14 +65,14 @@ def remove_unauthorized_classifications(classifications: list[str]) -> list[str]:
 def update_cmr_with_classifications(
     inferences: list[dict[str, dict]],
     cmr_dict: dict[str, dict[str, dict]],
-    threshold: float = 0.5,
+    thresholds: dict[str, float],
 ) -> list[dict[str, dict]]:
     """Update CMR data with valid classifications based on inferences."""

     predicted_cmr = []

     for inference in inferences:
-        classifications = process_classifications(predictions=inference["predictions"], threshold=threshold)
+        classifications = process_classifications(predictions=inference["predictions"], thresholds=thresholds)
         classifications = remove_unauthorized_classifications(classifications)

         if classifications:
@@ -84,17 +86,30 @@ def main():
-    inferences = load_json_file("cmr-inference.json")
+    thresholds = {
+        "Not EJ": 0.80,
+        "Climate Change": 0.95,
+        "Disasters": 0.80,
+        "Extreme Heat": 0.50,
+        "Food Availability": 0.80,
+        "Health & Air Quality": 0.90,
+        "Human Dimensions": 0.80,
+        "Urban Flooding": 0.50,
+        "Water Availability": 0.80,
+    }
+
+    inferences = load_json_file("alpha-1.3-wise-vortex-42-predictions.json")
     cmr = load_json_file("cmr_collections_umm_20240807_142146.json")
     cmr_dict = create_cmr_dict(cmr)

-    predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, threshold=0.8)
+    predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, thresholds=thresholds)

     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     file_name = f"ej_dump_{timestamp}.json"

     save_to_json(predicted_cmr, file_name)
+    print(f"Saved to {file_name}")


 if __name__ == "__main__":
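
A worked example of the per-indicator behaviour introduced above may help; the prediction scores are invented for illustration, and the snippet assumes it runs with access to process_classifications from scripts/ej/create_ej_dump.py:

thresholds = {"Not EJ": 0.80, "Climate Change": 0.95, "Extreme Heat": 0.50}

predictions = [
    {"label": "Climate Change", "score": 0.97},  # 0.97 >= 0.95 -> kept
    {"label": "Extreme Heat", "score": 0.62},    # 0.62 >= 0.50 -> kept
    {"label": "Not EJ", "score": 0.10},          # excluded unless it scores highest overall
]

# "Climate Change" wins the argmax, so the early "Not EJ" return is skipped and
# each label is then checked against its own threshold rather than a global one.
print(process_classifications(predictions, thresholds))
# -> ['Climate Change', 'Extreme Heat']

Note that every label the model can emit needs an entry in the thresholds dict, since thresholds[pred["label"]] is looked up unconditionally.
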
From b559facb6a5a43104445943cf1eadec4fe6ae0e7 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 23 Oct 2024 21:48:31 -0500
Subject: [PATCH 032/441] Fixes issue #1071

---
 .envs/.local/.django                      |   5 +
 sde_collections/admin.py                  |  18 ++
 .../0059_candidateurl_scraped_text.py     |  18 ++
 sde_collections/models/candidate_url.py   |   1 +
 sde_collections/tasks.py                  | 192 +++++++++++++++++-
 5 files changed, 232 insertions(+), 2 deletions(-)
 create mode 100644 sde_collections/migrations/0059_candidateurl_scraped_text.py

diff --git a/.envs/.local/.django b/.envs/.local/.django
index 402efc3c..ce2e8095 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -39,3 +39,8 @@ XLI_USER=''
 XLI_PASSWORD=''
 LRM_QA_USER=''
 LRM_QA_PASSWORD=''
+
+#Server Tokens
+#--------------------------------------------------------------------------------
+LRMDEV_TOKEN=''
+LIS_TOKEN=''
\ No newline at end of file

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index cb105f80..ecf92838 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -7,6 +7,22 @@
 from .models.collection import Collection, WorkflowHistory
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
 from .tasks import import_candidate_urls_from_api
+from .tasks import fetch_and_update_full_text
+
+
+@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
+def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
+    for collection in queryset:
+        fetch_and_update_full_text.delay(collection.id, "LRM_DEV")
+    modeladmin.message_user(request, "Full text fetched and updated from LRM_DEV successfully.")
+
+
+@admin.action(description="Import candidate URLs from Li's Server with Full Text")
+def fetch_full_text_lis_action(modeladmin, request, queryset):
+    for collection in queryset:
+        fetch_and_update_full_text.delay(collection.id, "LIS")
+    modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.")
+


 @admin.action(description="Generate deployment message")
@@ -239,6 +255,8 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
         import_candidate_urls_lis_server,
         import_candidate_urls_lrm_dev_server,
         import_candidate_urls_lrm_qa_server,
+        fetch_full_text_lrm_dev_action,
+        fetch_full_text_lis_action,
     ]
     ordering = ("cleaning_order",)

diff --git a/sde_collections/migrations/0059_candidateurl_scraped_text.py b/sde_collections/migrations/0059_candidateurl_scraped_text.py
new file mode 100644
index 00000000..cc3ea65b
--- /dev/null
+++ b/sde_collections/migrations/0059_candidateurl_scraped_text.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2024-10-21 23:10
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="candidateurl",
+            name="scraped_text",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py
index 51c3a28b..936ea363 100644
--- a/sde_collections/models/candidate_url.py
+++ b/sde_collections/models/candidate_url.py
@@ -35,6 +35,7 @@
         blank=True,
         help_text="This is the original title scraped by Sinequa",
     )
+    scraped_text = models.TextField(blank=True, null=True)
     generated_title = models.CharField(
         "Generated Title",
         default="",

diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index fa754efc..3172b22f 100644
---
a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,13 +1,13 @@ import json import os import shutil - +import requests import boto3 from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata - +from sde_collections.models.candidate_url import CandidateURL from config import celery_app from .models.collection import Collection, WorkflowStatusChoices @@ -141,3 +141,191 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() +''' +@celery_app.task +def fetch_and_update_full_text(collection_id): + + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server + sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" + token = os.getenv('LRMDEV_TOKEN') + + + payload = json.dumps({ + "method": "engine.sql", + "sql": sql_command, + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + # Directly update or create the entry without checking for content changes + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Processed {len(records)} records; Updated or created in database." + else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + ''' + +#You will have to have a different function for Li's server as it uses user and pw with body to login. +#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). 
if lis then header1, elif lrm_dev then h2, else h3 +#Fill in the tokens in the .django file + +#Integrated - LRM devs and Lis separate +''' +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + # Server-specific configurations + server_config = get_server_config(server_type) + + # API Request Parameters + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + token = server_config["token"] + url = server_config["url"] + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + # Send the request + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + return f"Processed {len(records)} records; Updated or created in database." + else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + + +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") +''' + + +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + server_config = get_server_config(server_type) + token = server_config["token"] + url = server_config["url"] + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + try: + response = requests.post(url, headers=headers, data=payload, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {str(e)}") + + records = response.json().get("Rows", []) + if not records: + return "No records found in the response." + + for record in records: + url, full_text, title = record + if not (url and full_text and title): + continue + + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Successfully processed {len(records)} records and updated the database." 
+ +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") + From 8678ed6e83edc61461c51a51cc8bd9b5c9190dde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:05:09 +0000 Subject: [PATCH 033/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .envs/.local/.django | 2 +- sde_collections/admin.py | 4 +- sde_collections/tasks.py | 82 ++++++++++++++++++---------------------- 3 files changed, 39 insertions(+), 49 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index ce2e8095..07e159fa 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -43,4 +43,4 @@ LRM_QA_PASSWORD='' #Server Tokens #-------------------------------------------------------------------------------- LRMDEV_TOKEN='' -LIS_TOKEN='' \ No newline at end of file +LIS_TOKEN='' diff --git a/sde_collections/admin.py b/sde_collections/admin.py index ecf92838..7b519a15 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,8 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .tasks import import_candidate_urls_from_api -from .tasks import fetch_and_update_full_text +from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api @admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") @@ -22,7 +21,6 @@ def fetch_full_text_lis_action(modeladmin, request, queryset): for collection in queryset: fetch_and_update_full_text.delay(collection.id, "LIS") modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.") - @admin.action(description="Generate deployment message") diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 3172b22f..8d93a2de 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,14 +1,16 @@ import json import os import shutil -import requests + import boto3 +import requests from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata -from sde_collections.models.candidate_url import CandidateURL + from config import celery_app +from sde_collections.models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowStatusChoices from .sinequa_api import Api @@ -141,15 +143,17 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() -''' + + +""" @celery_app.task def fetch_and_update_full_text(collection_id): - + try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" token = os.getenv('LRMDEV_TOKEN') @@ -164,12 +168,12 @@ def 
fetch_and_update_full_text(collection_id): "resolveIndexList": "false", "engines": "default" }) - + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {token}' } - + response = requests.post(url, headers=headers, data=payload) if response.status_code == 200: records = response.json().get("Rows", []) @@ -190,21 +194,21 @@ def fetch_and_update_full_text(collection_id): return f"Processed {len(records)} records; Updated or created in database." else: raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - ''' + """ -#You will have to have a different function for Li's server as it uses user and pw with body to login. -#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 -#Fill in the tokens in the .django file +# You will have to have a different function for Li's server as it uses user and pw with body to login. +# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 +# Fill in the tokens in the .django file -#Integrated - LRM devs and Lis separate -''' +# Integrated - LRM devs and Lis separate +""" @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + # Server-specific configurations server_config = get_server_config(server_type) @@ -260,7 +264,7 @@ def get_server_config(server_type): } else: raise ValueError("Invalid server type.") -''' +""" @celery_app.task @@ -274,20 +278,19 @@ def fetch_and_update_full_text(collection_id, server_type): token = server_config["token"] url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + + payload = json.dumps( + { + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default", + } + ) try: response = requests.post(url, headers=headers, data=payload, timeout=10) @@ -302,30 +305,19 @@ def fetch_and_update_full_text(collection_id, server_type): for record in records: url, full_text, title = record if not (url and full_text and title): - continue + continue CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } + url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title} ) return f"Successfully processed {len(records)} records and updated the database." 
+ def get_server_config(server_type): if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } + return {"url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LRMDEV_TOKEN")} elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } + return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - From e4881a94adaa5dba4d9dca928a55117ef4e671b7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 23 Oct 2024 22:24:32 -0500 Subject: [PATCH 034/441] Fixes issue #1071 --- sde_collections/tasks.py | 124 +-------------------------------------- 1 file changed, 1 insertion(+), 123 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 8d93a2de..0c54ea0c 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,129 +144,6 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() - -""" -@celery_app.task -def fetch_and_update_full_text(collection_id): - - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server - sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" - token = os.getenv('LRMDEV_TOKEN') - - - payload = json.dumps({ - "method": "engine.sql", - "sql": sql_command, - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - # Directly update or create the entry without checking for content changes - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - """ - -# You will have to have a different function for Li's server as it uses user and pw with body to login. -# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). 
if lis then header1, elif lrm_dev then h2, else h3 -# Fill in the tokens in the .django file - -# Integrated - LRM devs and Lis separate -""" -@celery_app.task -def fetch_and_update_full_text(collection_id, server_type): - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - # Server-specific configurations - server_config = get_server_config(server_type) - - # API Request Parameters - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - token = server_config["token"] - url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - # Send the request - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - - -def get_server_config(server_type): - if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } - elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } - else: - raise ValueError("Invalid server type.") -""" - - @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -321,3 +198,4 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") + \ No newline at end of file From 47f164f7f7a5d3a1f3f983d92d9d1bd4636f087b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:25:01 +0000 Subject: [PATCH 035/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 0c54ea0c..f505c942 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,6 +144,7 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() + @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -198,4 +199,3 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - \ No newline at end of file From f4849e862184c83e20e115f2ce2beffb38daf914 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Tue, 29 Oct 2024 23:09:16 -0500 Subject: [PATCH 036/441] add PairedFieldDescriptor two-column tag model --- ...ection_tdamm_manual_collection_tdamm_ml.py | 23 ++++++++++++++++++ 
 sde_collections/models/collection.py          |  5 ++++
 .../utils/paired_field_descriptor.py          | 24 +++++++++++++++++++
 3 files changed, 52 insertions(+)
 create mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
 create mode 100644 sde_collections/utils/paired_field_descriptor.py

diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
new file mode 100644
index 00000000..557ad13e
--- /dev/null
+++ b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
@@ -0,0 +1,23 @@
+# Generated by Django 4.2.9 on 2024-10-30 00:44
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_manual",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_ml",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+    ]

diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py
index 31306b8c..a2d3181c 100644
--- a/sde_collections/models/collection.py
+++ b/sde_collections/models/collection.py
@@ -26,6 +26,7 @@
     UpdateFrequencies,
     WorkflowStatusChoices,
 )
+from ..utils.paired_field_descriptor import PairedFieldDescriptor

 User = get_user_model()

@@ -33,6 +34,10 @@
 class Collection(models.Model):
     """Model definition for Collection."""

+    tdamm_manual = models.CharField(max_length=255, null=True, blank=True)
+    tdamm_ml = models.CharField(max_length=255, null=True, blank=True)
+    tdamm = PairedFieldDescriptor('tdamm')
+
     name = models.CharField("Name", max_length=1024)
     config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False)
     url = models.URLField("URL", max_length=2048)

diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py
new file mode 100644
index 00000000..e07d41dc
--- /dev/null
+++ b/sde_collections/utils/paired_field_descriptor.py
@@ -0,0 +1,24 @@
+from django.db import models
+
+
+class PairedFieldDescriptor:
+    def __init__(self, field_name):
+        self.manual_field_name = f"{field_name}_manual"
+        self.ml_field_name = f"{field_name}_ml"
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            return self
+        # Return manual tag if available, otherwise ML tag
+        manual_value = getattr(instance, self.manual_field_name, None)
+        machine_learning_value = getattr(instance, self.ml_field_name, None)
+        return manual_value if manual_value is not None else machine_learning_value
+
+    def __set__(self, instance, value):
+        # Set the value of the manual field
+        setattr(instance, self.manual_field_name, value)
+
+    def __delete__(self, instance):
+        # Delete both manual and ML fields
+        delattr(instance, self.manual_field_name)
+        delattr(instance, self.ml_field_name)
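
Since PairedFieldDescriptor is the core abstraction of this patch, a small standalone sketch of its resolution order may be useful; plain attributes stand in for the Django model columns here, and the tag values are placeholders:

from sde_collections.utils.paired_field_descriptor import PairedFieldDescriptor

class FakeCollection:
    tdamm = PairedFieldDescriptor("tdamm")  # reads tdamm_manual first, then tdamm_ml

    def __init__(self, manual=None, ml=None):
        self.tdamm_manual = manual
        self.tdamm_ml = ml

c = FakeCollection(ml="ml-assigned-tag")
print(c.tdamm)           # -> "ml-assigned-tag" (manual is None, so the ML tag wins)

c.tdamm = "curator-tag"  # __set__ writes to the manual field only
print(c.tdamm_manual)    # -> "curator-tag"
print(c.tdamm)           # -> "curator-tag" (manual now takes precedence)

One design consequence worth noting: because __get__ falls back only when the manual value is None, an empty string stored in the manual field will still shadow the ML value.
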
From a469ef1242824645885433ac0d3ecd8d4a23a7fe Mon Sep 17 00:00:00 2001
From: Kiran Dawadi
Date: Wed, 30 Oct 2024 16:14:07 -0500
Subject: [PATCH 037/441] add fields to admin panel

---
 sde_collections/admin.py                          | 35 +++++++++++++++++++
 ...remove_collection_tdamm_manual_and_more.py     | 31 ++++++++++++++++
 sde_collections/models/collection.py              |  6 ++--
 sde_collections/serializers.py                    |  1 +
 4 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index cb105f80..add9a906 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -7,6 +7,7 @@
 from .models.collection import Collection, WorkflowHistory
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
 from .tasks import import_candidate_urls_from_api
+from django import forms


 @admin.action(description="Generate deployment message")
@@ -174,10 +175,34 @@
     update_config.short_description = "Update configs of selected"


+class CollectionForm(forms.ModelForm):
+    tdamm_tag = forms.CharField(required=False, label="TDAMM Tag")
+
+    class Meta:
+        model = Collection
+        fields = "__all__"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.instance and hasattr(self.instance, "tdamm_tag"):
+            # Set the initial value of tdamm_tag to the computed value
+            self.fields["tdamm_tag"].initial = self.instance.tdamm_tag
+
+    def clean(self):
+        cleaned_data = super().clean()
+        tdamm_value = cleaned_data.get("tdamm_tag")
+        if tdamm_value:
+            # Set the manual field with the value from tdamm
+            cleaned_data["tdamm_tag_manual"] = tdamm_value
+        return cleaned_data
+
+
 @admin.register(Collection)
 class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
     """Admin View for Collection"""

+    form = CollectionForm
+
     fieldsets = (
         (
             "Essential information",
             {
                 "fields": (
                     "name",
                     "config_folder",
                     "url",
                     "division",
+                    "tdamm_tag",
+                    "tdamm_tag_ml",
+                    "tdamm_tag_manual",
                     "document_type",
                     "update_frequency",
                     "source",
@@ -215,15 +243,22 @@
         ),
     )

+    def tdamm_tag(self, obj):
+        return obj.tdamm_tag
+
     list_display = (
         "name",
         "candidate_urls_count",
         "config_folder",
         "url",
+        "tdamm_tag",
+        "tdamm_tag_ml",
+        "tdamm_tag_manual",
         "division",
         "new_collection",
         "is_multi_division",
     )
+
     readonly_fields = ("config_folder",)
     list_filter = ("division", "curation_status", "workflow_status", "turned_on", "is_multi_division")
     search_fields = ("name", "url", "config_folder")

diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py
new file mode 100644
index 00000000..37b817a7
--- /dev/null
+++ b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py
@@ -0,0 +1,31 @@
+# Generated by Django 4.2.9 on 2024-10-30 21:05
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="collection",
+            name="tdamm_manual",
+        ),
+        migrations.RemoveField(
+            model_name="collection",
+            name="tdamm_ml",
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_tag_manual",
+            field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"),
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_tag_ml",
+            field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"),
+        ),
+    ]

diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py
index a2d3181c..1d140889 100644
--- a/sde_collections/models/collection.py
+++
b/sde_collections/models/collection.py @@ -34,9 +34,9 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_manual = models.CharField(max_length=255, null=True, blank=True) - tdamm_ml = models.CharField(max_length=255, null=True, blank=True) - tdamm = PairedFieldDescriptor('tdamm') + tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") + tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") + tdamm_tag = PairedFieldDescriptor('tdamm_tag') name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..19717818 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,6 +26,7 @@ class Meta: "workflow_status_display", "curated_by", "division", + "tdamm_tag", "document_type", "name", ) From 8e8e0ac743f6a915e8196c6dc9914060766315eb Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 00:13:31 -0600 Subject: [PATCH 038/441] moved tdamm_tags feature from collection to candidate_url --- sde_collections/admin.py | 127 ++++++++++++----- ..._candidateurl_tdamm_tag_manual_and_more.py | 134 ++++++++++++++++++ ...ection_tdamm_manual_collection_tdamm_ml.py | 23 --- ...remove_collection_tdamm_manual_and_more.py | 31 ---- sde_collections/models/candidate_url.py | 60 +++++++- sde_collections/models/collection.py | 5 - sde_collections/serializers.py | 2 +- 7 files changed, 287 insertions(+), 95 deletions(-) create mode 100644 sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py delete mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py delete mode 100644 sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index add9a906..73576899 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,6 +8,7 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms +from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -175,34 +176,10 @@ def update_config(self, request, queryset): update_config.short_description = "Update configs of selected" -class CollectionForm(forms.ModelForm): - tdamm_tag = forms.CharField(required=False, label="TDAMM Tag") - - class Meta: - model = Collection - fields = "__all__" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.instance and hasattr(self.instance, "tdamm_tag"): - # Set the initial value of tdamm_tag to the computed value - self.fields["tdamm_tag"].initial = self.instance.tdamm_tag - - def clean(self): - cleaned_data = super().clean() - tdamm_value = cleaned_data.get("tdamm_tag") - if tdamm_value: - # Set the manual field with the value from tdamm - cleaned_data["tdamm_tag_manual"] = tdamm_value - return cleaned_data - - @admin.register(Collection) class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): """Admin View for Collection""" - form = CollectionForm - fieldsets = ( ( "Essential information", @@ -212,9 +189,6 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): "config_folder", "url", "division", - "tdamm_tag", - "tdamm_tag_ml", - 
"tdamm_tag_manual", "document_type", "update_frequency", "source", @@ -243,17 +217,11 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): ), ) - def tdamm_tag(self, obj): - return obj.tdamm_tag - list_display = ( "name", "candidate_urls_count", "config_folder", "url", - "tdamm_tag", - "tdamm_tag_ml", - "tdamm_tag_manual", "division", "new_collection", "is_multi_division", @@ -296,13 +264,104 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() +class CandidateURLForm(forms.ModelForm): + # tdamm_tag = forms.MultipleChoiceField( + # choices=CandidateURL.TDAMM_TAG_CHOICES, + # required=False, + # label="TDAMM Tags", + # widget=forms.CheckboxSelectMultiple, + # ) + + tdamm_tag_ml = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM ML Tags", + widget=forms.CheckboxSelectMultiple, + ) + + tdamm_tag_manual = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM Manual Tags", + widget=forms.CheckboxSelectMultiple, + ) + + class Meta: + model = CandidateURL + fields = '__all__' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tdamm_tag + # if self.instance and hasattr(self.instance, 'tdamm_tag'): + # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] + + # Initialize tdamm_tag_ml + if self.instance and self.instance.tdamm_tag_ml: + self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml + + # Initialize tdamm_tag_manual + if self.instance and self.instance.tdamm_tag_manual: + self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + + def clean(self): + cleaned_data = super().clean() + + # Handle tdamm_tag + # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) + # if not tdamm_tag_value: + # cleaned_data['tdamm_tag_manual'] = None + # else: + # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value + + # Handle tdamm_tag_ml + tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) + if not tdamm_tag_ml_value: + cleaned_data['tdamm_tag_ml'] = None + + # Handle tdamm_tag_manual + tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) + if not tdamm_tag_manual_value: + cleaned_data['tdamm_tag_manual'] = None + + return cleaned_data class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" - list_display = ("url", "scraped_title", "collection") + form = CandidateURLForm + + list_display = ( + "url", + "scraped_title", + "collection", + # "tdamm_tag_display", + "tdamm_tag_ml_display", + "tdamm_tag_manual_display" + ) list_filter = ("collection",) + # @admin.display(description='TDAMM Tags') + # def tdamm_tag_display(self, obj): + # if obj.tdamm_tag: + # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] + # return ", ".join(readable_tags) + # return "" + + @admin.display(description='TDAMM ML Tags') + def tdamm_tag_ml_display(self, obj): + if obj.tdamm_tag_ml: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] + return ", ".join(readable_tags) + return "" + + @admin.display(description='TDAMM Manual Tags') + def tdamm_tag_manual_display(self, obj): + if obj.tdamm_tag_manual: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] + return ", ".join(readable_tags) + return "" + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for 
TitlePattern""" diff --git a/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..057f1ed6 --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,134 @@ +# Generated by Django 4.2.9 on 2024-11-02 04:36 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger 
- EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_manual", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_ml", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), + ), + ] diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py deleted file mode 100644 index 557ad13e..00000000 --- a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-30 00:44 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), - ] - - operations = [ - migrations.AddField( - model_name="collection", - name="tdamm_manual", - field=models.CharField(blank=True, max_length=255, null=True), - ), - migrations.AddField( - model_name="collection", - name="tdamm_ml", - field=models.CharField(blank=True, max_length=255, null=True), - ), - ] diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py deleted file mode 100644 index 37b817a7..00000000 --- a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by 
Django 4.2.9 on 2024-10-30 21:05 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"), - ] - - operations = [ - migrations.RemoveField( - model_name="collection", - name="tdamm_manual", - ), - migrations.RemoveField( - model_name="collection", - name="tdamm_ml", - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_manual", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_ml", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), - ), - ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 51c3a28b..f8c91a97 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -7,7 +7,8 @@ from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern - +from ..utils.paired_field_descriptor import PairedFieldDescriptor +from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): @@ -80,6 +81,63 @@ class CandidateURL(models.Model): help_text="Helps keep track if the Current URL is present in production or not", ) + TDAMM_TAG_CHOICES = [ + ('MMA_M_EM', 'Messenger - EM Radiation'), + ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), + ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), + ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), + ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), + ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), + ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), + ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), + ('MMA_M_G', 'Messenger - Gravitational Waves'), + ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), + ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), + ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), + ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), + ('MMA_M_C', 'Messenger - Cosmic Rays'), + ('MMA_M_N', 'Messenger - Neutrinos'), + ('MMA_O_BI', 'Objects - Binaries'), + ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), + ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), + ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), + ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), + ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), + ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), + ('MMA_O_BH', 'Objects - Black Holes'), + ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), + ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), + ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), + ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), + ('MMA_O_E', 'Objects - Exoplanets'), + ('MMA_O_N', 'Objects - Neutron Stars'), + ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), + ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), + ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), + ('MMA_O_S', 'Objects - Supernova Remnants'), + ('MMA_S_F', 'Signals - Fast Radio Bursts'), + ('MMA_S_G', 'Signals - Gamma-ray Bursts'), + ('MMA_S_K', 'Signals - Kilonovae'), + ('MMA_S_N', 'Signals - Novae'), + ('MMA_S_P', 'Signals - 
Pevatrons'), + ('MMA_S_ST', 'Signals - Stellar flares'), + ('MMA_S_SU', 'Signals - Supernovae'), + ] + + tdamm_tag_manual = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM Manual Tags" + ) + tdamm_tag_ml = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM ML Tags" + ) + tdamm_tag = PairedFieldDescriptor('tdamm_tag') + class Meta: """Meta definition for Candidate URL.""" diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 1d140889..31306b8c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -26,7 +26,6 @@ UpdateFrequencies, WorkflowStatusChoices, ) -from ..utils.paired_field_descriptor import PairedFieldDescriptor User = get_user_model() @@ -34,10 +33,6 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") - tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") - tdamm_tag = PairedFieldDescriptor('tdamm_tag') - name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) url = models.URLField("URL", max_length=2048) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 19717818..b7bb3b25 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,7 +26,6 @@ class Meta: "workflow_status_display", "curated_by", "division", - "tdamm_tag", "document_type", "name", ) @@ -123,6 +122,7 @@ class Meta: "hash", "file_extension", "tree_root", + "tdamm_tag" ) def get_document_type(self, obj): From 6bf48ff100d32cfe3e52605b13625f044210e79b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:19 -0600 Subject: [PATCH 039/441] adding admin views for DumpURL and URL models --- sde_collections/admin.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 4fce1ea7..a8fce352 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,7 +7,9 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -315,6 +317,20 @@ class DeltaUrlAdmin(admin.ModelAdmin): list_filter = ("collection",) +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "generated_title", "collection") + list_filter = ("collection",) + + +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) @@ -323,3 +339,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): admin.site.register(DivisionPattern, DivisionPatternAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(Url, UrlAdmin) From 
483685135cceffd131de25ffbf78c9d0bbdee929 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:34 -0600 Subject: [PATCH 040/441] migration for the dump URL file --- sde_collections/migrations/0061_dumpurl.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py new file mode 100644 index 00000000..4aeb0088 --- /dev/null +++ b/sde_collections/migrations/0061_dumpurl.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.9 on 2024-10-23 19:29 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0060_delete_dumpurl"), + ] + + operations = [ + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 19feff8cd273488bb727db9e9b81b9a0a112701b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:49:31 -0600 Subject: [PATCH 041/441] adding tasks to compare and add URLs to the new models --- sde_collections/tasks.py | 109 +++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 10 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index fa754efc..ecc3c1a9 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -6,11 +6,14 @@ from django.apps import apps from django.conf import settings from django.core import management -from django.core.management.commands import loaddata from config import celery_app from .models.collection import Collection, WorkflowStatusChoices +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl +from .models.url import Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler @@ -49,7 +52,7 @@ def _get_data_to_import(collection, server_name): continue augmented_data = { - "model": "sde_collections.candidateurl", + "model": "sde_collections.url", "fields": { "collection": collection_pk, "url": url, @@ -62,6 +65,88 @@ def _get_data_to_import(collection, server_name): return data_to_import +def _compare_and_populate_delta_urls(collection): + """Compare DumpUrl and CuratedUrl and populate DeltaUrl.""" + dump_urls = DumpUrl.objects.filter(collection=collection) + curated_urls = CuratedUrl.objects.filter(collection=collection) + + DeltaUrl.objects.filter(collection=collection).delete() + + curated_urls_dict = {url.url: url for url in curated_urls} + + # Iterate over Dump URLs to find deltas + for dump_url in dump_urls: + curated_url = curated_urls_dict.get(dump_url.url) + + if not curated_url: + # New URL found, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + elif ( + curated_url.scraped_title != dump_url.scraped_title + or curated_url.generated_title != dump_url.generated_title + or curated_url.document_type != dump_url.document_type + or curated_url.division != 
dump_url.division + ): + # Metadata changed, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + + # Mark any missing URLs in CuratedUrl as deleted in DeltaUrl + dump_url_set = set(dump_urls.values_list("url", flat=True)) + for curated_url in curated_urls: + if curated_url.url not in dump_url_set: + DeltaUrl.objects.create( + collection=collection, + url=curated_url.url, + scraped_title=curated_url.scraped_title, + generated_title=curated_url.generated_title, + document_type=curated_url.document_type, + division=curated_url.division, + delete=True, + ) + + +def populate_dump_urls(collection): + urls = Url.objects.filter(collection=collection) + + for url_instance in urls: + try: + # Create DumpUrl by passing in the parent Url fields + dump_url_instance = DumpUrl( + id=url_instance.id, + collection=url_instance.collection, + url=url_instance.url, + scraped_title=url_instance.scraped_title, + visited=url_instance.visited, + document_type=url_instance.document_type, + division=url_instance.division, + ) + dump_url_instance.save() # Save both Url and DumpUrl entries + + print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}") + + except Exception as e: + print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}") + continue + + print(f"Successfully populated DumpUrl model with {urls.count()} entries.") + + @celery_app.task(soft_time_limit=10000) def import_candidate_urls_from_api(server_name="test", collection_ids=[]): TEMP_FOLDER_NAME = "temp" @@ -76,26 +161,30 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]): data_to_import = _get_data_to_import(server_name=server_name, collection=collection) print(f"Got {len(data_to_import)} records for {collection.config_folder}") + print("Clearing DumpUrl model...") + DumpUrl.objects.filter(collection=collection).delete() + print("Dumping django fixture to file") json.dump(data_to_import, open(urls_file, "w")) - print("Deleting existing candidate URLs") - # this sometimes takes a while - collection.candidate_urls.all().delete() + print("Loading data into Url model using loaddata...") + management.call_command("loaddata", urls_file) - print("Loading fixture; this may take a while") - # subprocess.call(f'python manage.py loaddata "{urls_file}"', shell=True) - management.call_command(loaddata.Command(), urls_file) + print("Creating DumpUrl entries...") + populate_dump_urls(collection) print("Applying existing patterns; this may take a while") collection.apply_all_patterns() - if collection.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING: + print("Comparing DumpUrl with CuratedUrl...") + _compare_and_populate_delta_urls(collection) + + if collection.workflow_status != WorkflowStatusChoices.ENGINEERING_IN_PROGRESS: collection.workflow_status = WorkflowStatusChoices.ENGINEERING_IN_PROGRESS collection.save() # Finally set the status to READY_FOR_CURATION - collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION + # collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION collection.save() print("Deleting temp files") From 7e24495fb2489615c0b8a6fd4b79d2e7550c436c Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:50:26 -0600 Subject: [PATCH 042/441] adding a save method for dump URL --- sde_collections/models/dump_url.py | 5 +++++ 
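Note on the save() override below: with Django multi-table inheritance, a
single child save() already writes both the parent and the child rows, so on
a fresh create the guarded double call ends up writing each table twice. The
guard only pays off when a DumpUrl is built with the id of an existing Url
row, as populate_dump_urls() does in the previous patch. A commonly cited
alternative that attaches a child row to an existing parent without re-saving
the parent's fields is sketched here; promote_to_dump_url is a hypothetical
helper, not part of this patch, and assumes the DumpUrl/Url models as of this
point in the series:

    from sde_collections.models.dump_url import DumpUrl
    from sde_collections.models.url import Url

    def promote_to_dump_url(url: Url) -> DumpUrl:
        # Reuse the existing parent row rather than writing Url again.
        dump = DumpUrl(url_ptr=url)
        # Copy the parent's in-memory field state onto the child instance.
        dump.__dict__.update(url.__dict__)
        # raw=True skips saving parent models, so only the DumpUrl
        # link-table row is written.
        dump.save_base(raw=True)
        return dump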
1 file changed, 5 insertions(+) diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py index 85ef85d9..82e168ca 100644 --- a/sde_collections/models/dump_url.py +++ b/sde_collections/models/dump_url.py @@ -7,3 +7,8 @@ class DumpUrl(Url): class Meta: verbose_name = "Dump URL" verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) From e5e64f46c26d822c971d741c774bed8dabf1121b Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:34:53 -0600 Subject: [PATCH 043/441] move all url models into the same file --- sde_collections/models/curated_url.py | 9 -------- sde_collections/models/delta_url.py | 13 ----------- sde_collections/models/dump_url.py | 14 ------------ sde_collections/models/url.py | 31 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 36 deletions(-) delete mode 100644 sde_collections/models/curated_url.py delete mode 100644 sde_collections/models/delta_url.py delete mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py deleted file mode 100644 index d55dcb5f..00000000 --- a/sde_collections/models/curated_url.py +++ /dev/null @@ -1,9 +0,0 @@ -from .url import Url - - -class CuratedUrl(Url): - """Model for storing curated and live URLs after the curation process.""" - - class Meta: - verbose_name = "Curated URL" - verbose_name_plural = "Curated URLs" diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py deleted file mode 100644 index 028607ab..00000000 --- a/sde_collections/models/delta_url.py +++ /dev/null @@ -1,13 +0,0 @@ -from django.db import models - -from .url import Url - - -class DeltaUrl(Url): - """Model for storing delta URLs for curation purposes""" - - delete = models.BooleanField(default=False) - - class Meta: - verbose_name = "Delta URL" - verbose_name_plural = "Delta URLs" diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py deleted file mode 100644 index 82e168ca..00000000 --- a/sde_collections/models/dump_url.py +++ /dev/null @@ -1,14 +0,0 @@ -from .url import Url - - -class DumpUrl(Url): - """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" - - class Meta: - verbose_name = "Dump URL" - verbose_name_plural = "Dump URLs" - - def save(self, *args, **kwargs): - if not self.pk: # Ensure it's only called on create - super().save(*args, **kwargs) # Save the parent `Url` entry - super().save(*args, **kwargs) diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py index 7ce86dff..3fc70243 100644 --- a/sde_collections/models/url.py +++ b/sde_collections/models/url.py @@ -83,3 +83,34 @@ def __str__(self) -> str: def save(self, *args, **kwargs): super().save(*args, **kwargs) + + +class DumpUrl(Url): + """Model for storing all the imported URLs before separating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + 
verbose_name_plural = "Delta URLs" + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 7a906b71d5355fc13cacafd1f985ee692e9474ef Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:42:44 -0600 Subject: [PATCH 044/441] update admin url imports --- sde_collections/admin.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index a8fce352..df33af9d 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,11 +5,8 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url +from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl from .tasks import import_candidate_urls_from_api From 728a5b425b76d402ffefb83aef5f574fa7b84c2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:42:59 +0000 Subject: [PATCH 045/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index df33af9d..e7780846 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,7 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .tasks import import_candidate_urls_from_api From f5c69bd4ce64c1edcfdd700e15e0e0404b19ce67 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 14:46:19 -0600 Subject: [PATCH 046/441] refactor code --- sde_collections/admin.py | 171 ++++++++++++------ ..._candidateurl_tdamm_tag_manual_and_more.py | 151 ++++++++++++++++ sde_collections/models/candidate_url.py | 120 +++++++----- sde_collections/serializers.py | 21 ++- .../utils/paired_field_descriptor.py | 3 - 5 files changed, 349 insertions(+), 117 deletions(-) create mode 100644 sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 73576899..0860d0e5 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,7 +8,6 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms -from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -264,14 +263,8 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() -class CandidateURLForm(forms.ModelForm): - # tdamm_tag = forms.MultipleChoiceField( - # choices=CandidateURL.TDAMM_TAG_CHOICES, - # required=False, - # label="TDAMM Tags", - # widget=forms.CheckboxSelectMultiple, - # ) +class CandidateURLForm(forms.ModelForm): tdamm_tag_ml = forms.MultipleChoiceField( 
choices=CandidateURL.TDAMM_TAG_CHOICES, required=False, @@ -285,83 +278,141 @@ class CandidateURLForm(forms.ModelForm): label="TDAMM Manual Tags", widget=forms.CheckboxSelectMultiple, ) - + class Meta: model = CandidateURL - fields = '__all__' + fields = "__all__" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Initialize tdamm_tag - # if self.instance and hasattr(self.instance, 'tdamm_tag'): - # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] - - # Initialize tdamm_tag_ml - if self.instance and self.instance.tdamm_tag_ml: - self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml - - # Initialize tdamm_tag_manual - if self.instance and self.instance.tdamm_tag_manual: - self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + instance = kwargs.get("instance") + + # Only show TDAMM fields if is_tdamm is True + if not instance or not instance.is_tdamm: + if "tdamm_tag_ml" in self.fields: + del self.fields["tdamm_tag_ml"] + if "tdamm_tag_manual" in self.fields: + del self.fields["tdamm_tag_manual"] + else: + # Initialize tdamm fields only if is_tdamm is True + if hasattr(self.instance, "tdamm_tag_ml"): + self.fields["tdamm_tag_ml"].initial = self.instance.tdamm_tag_ml or [] + + if hasattr(self.instance, "tdamm_tag_manual"): + self.fields["tdamm_tag_manual"].initial = self.instance.tdamm_tag_manual or [] def clean(self): cleaned_data = super().clean() - - # Handle tdamm_tag - # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) - # if not tdamm_tag_value: - # cleaned_data['tdamm_tag_manual'] = None - # else: - # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value - - # Handle tdamm_tag_ml - tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) - if not tdamm_tag_ml_value: - cleaned_data['tdamm_tag_ml'] = None - - # Handle tdamm_tag_manual - tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) - if not tdamm_tag_manual_value: - cleaned_data['tdamm_tag_manual'] = None - return cleaned_data + def save(self, commit=True): + instance = super().save(commit=False) + + # Handle TDAMM fields if is_tdamm is True + if instance.is_tdamm: + # Get values from the form + tdamm_tag_ml = self.cleaned_data.get("tdamm_tag_ml", []) + tdamm_tag_manual = self.cleaned_data.get("tdamm_tag_manual", []) + + # Set the values directly on the instance + instance.tdamm_tag_ml = tdamm_tag_ml or None + instance.tdamm_tag_manual = tdamm_tag_manual or None + else: + # Clear TDAMM fields if is_tdamm is False + instance.tdamm_tag_ml = None + instance.tdamm_tag_manual = None + + if commit: + instance.save() + + return instance + + class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" form = CandidateURLForm - list_display = ( - "url", - "scraped_title", - "collection", - # "tdamm_tag_display", - "tdamm_tag_ml_display", - "tdamm_tag_manual_display" - ) - list_filter = ("collection",) - - # @admin.display(description='TDAMM Tags') - # def tdamm_tag_display(self, obj): - # if obj.tdamm_tag: - # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] - # return ", ".join(readable_tags) - # return "" - - @admin.display(description='TDAMM ML Tags') + def get_list_display(self, request): + list_display = [ + "url", + "scraped_title", + "collection", + "is_tdamm", + ] + # Add TDAMM-related fields only if any TDAMM-enabled URLs exist + if CandidateURL.objects.filter(is_tdamm=True).exists(): + list_display.extend(["tdamm_tag_ml_display", "tdamm_tag_manual_display"]) + return list_display + + 
list_filter = ("collection", "is_tdamm") + + @admin.display(description="TDAMM ML Tags") def tdamm_tag_ml_display(self, obj): - if obj.tdamm_tag_ml: + if obj.is_tdamm and obj.tdamm_tag_ml: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] return ", ".join(readable_tags) return "" - @admin.display(description='TDAMM Manual Tags') + @admin.display(description="TDAMM Manual Tags") def tdamm_tag_manual_display(self, obj): - if obj.tdamm_tag_manual: + if obj.is_tdamm and obj.tdamm_tag_manual: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] return ", ".join(readable_tags) return "" + def get_fieldsets(self, request, obj=None): + """Dynamically adjust fieldsets based on is_tdamm""" + fieldsets = [ + ( + "Essential Information", + { + "fields": ( + "collection", + "url", + "hash", + "scraped_title", + "generated_title", + "test_title", + "production_title", + "level", + "visited", + "document_type", + "division", + "inferenced_by", + "is_pdf", + "present_on_test", + "present_on_prod", + "is_tdamm", + ) + }, + ), + ] + + # Add TDAMM fields only if is_tdamm is True + if obj and obj.is_tdamm: + fieldsets.append( + ( + "TDAMM Tags", + { + "fields": ( + "tdamm_tag_ml", + "tdamm_tag_manual", + ), + "classes": ("collapse",), + }, + ) + ) + + return fieldsets + + def save_model(self, request, obj, form, change): + """Ensure proper saving of the model""" + if not obj.is_tdamm: + obj.tdamm_tag_ml = None + obj.tdamm_tag_manual = None + super().save_model(request, obj, form, change) + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for TitlePattern""" diff --git a/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..d8a0a4a7 --- /dev/null +++ b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,151 @@ +# Generated by Django 4.2.9 on 2024-11-04 06:33 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + 
("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_manual", + new_name="_tdamm_tag_manual", + ), + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", 
"Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_ml", + new_name="_tdamm_tag_ml", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_manual", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_ml", + ), + migrations.AddField( + model_name="candidateurl", + name="is_tdamm", + field=models.BooleanField( + default=False, help_text="Enable TDAMM tagging for this URL", verbose_name="Is TDAMM" + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index f8c91a97..41c1072f 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -10,6 +10,7 @@ from ..utils.paired_field_descriptor import PairedFieldDescriptor from django.contrib.postgres.fields import ArrayField + class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): return self.annotate( @@ -80,63 +81,91 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) + is_tdamm = models.BooleanField("Is TDAMM?", default=False, help_text="Enable TDAMM tagging for this URL") TDAMM_TAG_CHOICES = [ - ('MMA_M_EM', 'Messenger - EM Radiation'), - ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), - ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), - ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), - ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), - ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), - ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), - ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), - ('MMA_M_G', 'Messenger - Gravitational Waves'), - ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), - ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), - ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), - ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), - ('MMA_M_C', 'Messenger - Cosmic Rays'), - ('MMA_M_N', 'Messenger - Neutrinos'), - ('MMA_O_BI', 'Objects - Binaries'), - ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), - ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), - ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), - ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), - ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), - ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), - ('MMA_O_BH', 'Objects - Black Holes'), - ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), - ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), - ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), - ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), - ('MMA_O_E', 'Objects - Exoplanets'), - ('MMA_O_N', 'Objects - Neutron Stars'), - ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), - ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), - ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), - ('MMA_O_S', 'Objects - Supernova Remnants'), - ('MMA_S_F', 'Signals - Fast Radio Bursts'), - ('MMA_S_G', 'Signals - Gamma-ray Bursts'), - ('MMA_S_K', 'Signals - Kilonovae'), - ('MMA_S_N', 'Signals - Novae'), - ('MMA_S_P', 'Signals - Pevatrons'), - ('MMA_S_ST', 
'Signals - Stellar flares'), - ('MMA_S_SU', 'Signals - Supernovae'), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), ] - tdamm_tag_manual = ArrayField( + # Define TDAMM fields but make them optional + @property + def tdamm_tag_manual(self): + if hasattr(self, "_tdamm_tag_manual") and self.is_tdamm: + return self._tdamm_tag_manual + return None + + @tdamm_tag_manual.setter + def tdamm_tag_manual(self, value): + if self.is_tdamm: + self._tdamm_tag_manual = value + + @property + def tdamm_tag_ml(self): + if hasattr(self, "_tdamm_tag_ml") and self.is_tdamm: + return self._tdamm_tag_ml + return None + + @tdamm_tag_ml.setter + def tdamm_tag_ml(self, value): + if self.is_tdamm: + self._tdamm_tag_ml = value + + _tdamm_tag_manual = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM Manual Tags" + verbose_name="TDAMM Manual Tags", + db_column="tdamm_tag_manual", ) - tdamm_tag_ml = ArrayField( + + _tdamm_tag_ml = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM ML Tags" + verbose_name="TDAMM ML Tags", + db_column="tdamm_tag_ml", ) - tdamm_tag = PairedFieldDescriptor('tdamm_tag') + + tdamm_tag = PairedFieldDescriptor("tdamm_tag") class Meta: """Meta definition for Candidate URL.""" @@ -144,6 +173,7 @@ class Meta: verbose_name = "Candidate URL" verbose_name_plural = "Candidate URLs" ordering = ["url"] + 
db_table = "sde_collections_candidateurl" @property def fileext(self) -> str: diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index b7bb3b25..29d86c31 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -112,18 +112,21 @@ class CandidateURLAPISerializer(serializers.ModelSerializer): title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() + tdamm_tag = serializers.SerializerMethodField() class Meta: model = CandidateURL - fields = ( - "url", - "title", - "document_type", - "hash", - "file_extension", - "tree_root", - "tdamm_tag" - ) + fields = ("url", "title", "document_type", "hash", "file_extension", "tree_root", "is_tdamm", "tdamm_tag") + + def to_representation(self, instance): + """Remove tdamm_tag field if is_tdamm is False""" + representation = super().to_representation(instance) + if not instance.is_tdamm: + representation.pop("tdamm_tag", None) + return representation + + def get_tdamm_tag(self, obj): + return obj.tdamm_tag def get_document_type(self, obj): if obj.document_type is not None: diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py index e07d41dc..9ac0c4e3 100644 --- a/sde_collections/utils/paired_field_descriptor.py +++ b/sde_collections/utils/paired_field_descriptor.py @@ -1,6 +1,3 @@ -from django.db import models - - class PairedFieldDescriptor: def __init__(self, field_name): self.manual_field_name = f"{field_name}_manual" From 7e888e8457f02bbe8417f6e66ecc1d52be9608c4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 20:49:23 +0000 Subject: [PATCH 047/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- sde_collections/models/candidate_url.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 0860d0e5..bf97cf02 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -1,5 +1,6 @@ import csv +from django import forms from django.contrib import admin, messages from django.http import HttpResponse @@ -7,7 +8,6 @@ from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api -from django import forms @admin.action(description="Generate deployment message") diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 41c1072f..8d2776dd 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -2,13 +2,13 @@ import os from urllib.parse import urlparse +from django.contrib.postgres.fields import ArrayField from django.db import models +from ..utils.paired_field_descriptor import PairedFieldDescriptor from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern -from ..utils.paired_field_descriptor import PairedFieldDescriptor -from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): From df88c6b11c1a91709bfcd01a1a88f8887f8b814b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:23:41 -0600 Subject: [PATCH 048/441] squashed migrations --- 
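Collapsing the history by hand, deleting 0060/0061 and regenerating 0059 so
that DumpUrl is created in a single step, is safe only while no database has
applied the deleted migrations; an environment that already ran them would
disagree with the new graph and need its django_migrations rows reconciled.
A quick consistency check after hand-editing migration files, a minimal
sketch assuming the project's configured Django settings (not part of this
patch):

    from django.core.management import call_command

    # Exits non-zero if the models no longer match the migration
    # files on disk.
    call_command("makemigrations", "sde_collections", check=True, dry_run=True)
    # Prints, without applying anything, the plan a fresh database would run.
    call_command("migrate", "sde_collections", plan=True)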
.../0059_url_curatedurl_deltaurl_dumpurl.py | 2 +- .../migrations/0060_delete_dumpurl.py | 16 --------- sde_collections/migrations/0061_dumpurl.py | 35 ------------------- 3 files changed, 1 insertion(+), 52 deletions(-) delete mode 100644 sde_collections/migrations/0060_delete_dumpurl.py delete mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py index 82f4d4af..58478546 100644 --- a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py +++ b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.9 on 2024-10-10 03:01 +# Generated by Django 4.2.9 on 2024-11-04 22:22 from django.db import migrations, models import django.db.models.deletion diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py deleted file mode 100644 index db9a10c1..00000000 --- a/sde_collections/migrations/0060_delete_dumpurl.py +++ /dev/null @@ -1,16 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-14 16:37 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"), - ] - - operations = [ - migrations.DeleteModel( - name="DumpUrl", - ), - ] diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py deleted file mode 100644 index 4aeb0088..00000000 --- a/sde_collections/migrations/0061_dumpurl.py +++ /dev/null @@ -1,35 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-23 19:29 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0060_delete_dumpurl"), - ] - - operations = [ - migrations.CreateModel( - name="DumpUrl", - fields=[ - ( - "url_ptr", - models.OneToOneField( - auto_created=True, - on_delete=django.db.models.deletion.CASCADE, - parent_link=True, - primary_key=True, - serialize=False, - to="sde_collections.url", - ), - ), - ], - options={ - "verbose_name": "Dump URL", - "verbose_name_plural": "Dump URLs", - }, - bases=("sde_collections.url",), - ), - ] From 48592cb6af69176b54fca27944dc0da370178aa1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:01 -0600 Subject: [PATCH 049/441] updated import references --- sde_collections/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 2f11700b..c42a84e6 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,7 +3,6 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -11,6 +10,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl class CollectionSerializer(serializers.ModelSerializer): From 266082c6f6054af9b0a72ed9cdf1a227012a080b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:20 -0600 Subject: [PATCH 050/441] updated import references --- sde_collections/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index ecc3c1a9..77876500 
100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -10,10 +10,7 @@ from config import celery_app from .models.collection import Collection, WorkflowStatusChoices -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl -from .models.url import Url +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler From c3e2aee2be337ab04387e860c6cef24fcc8266ac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:39 -0600 Subject: [PATCH 051/441] update import references --- sde_collections/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index b8ff70a0..5d5d2982 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,7 +27,6 @@ DocumentTypes, WorkflowStatusChoices, ) -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,6 +34,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, From f95a1a2666c3ab3d34c3331ccc883e83aa8c6006 Mon Sep 17 00:00:00 2001 From: Dhanur Sharma Date: Wed, 6 Nov 2024 15:55:28 -0600 Subject: [PATCH 052/441] Frontend work in progress --- sde_collections/serializers.py | 90 +- sde_collections/urls.py | 2 + sde_collections/views.py | 64 +- .../static/js/candidate_url_list.js | 848 +++++++++++++---- .../sde_collections/candidate_urls_list.html | 893 +++++++++++------- 5 files changed, 1378 insertions(+), 519 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index c42a84e6..ff1b6d3d 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -10,7 +10,7 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl class CollectionSerializer(serializers.ModelSerializer): @@ -99,6 +99,94 @@ class Meta: ) +class CuratedURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + curated_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_curated_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.curated_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = CuratedUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "curated_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + +class 
DeltaURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + delta_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_delta_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.delta_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = DeltaUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "delta_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + class CandidateURLBulkCreateSerializer(serializers.ModelSerializer): class Meta: model = CandidateURL diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 214d1198..a17f6390 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -9,6 +9,8 @@ router.register(r"collections", views.CollectionViewSet, basename="collection") router.register(r"collections-read", views.CollectionReadViewSet, basename="collection-read") router.register(r"candidate-urls", views.CandidateURLViewSet) +router.register(r"curated-urls", views.CuratedURLViewSet) +router.register(r"delta-urls", views.DeltaURLViewSet) router.register(r"exclude-patterns", views.ExcludePatternViewSet) router.register(r"include-patterns", views.IncludePatternViewSet) router.register(r"title-patterns", views.TitlePatternViewSet) diff --git a/sde_collections/views.py b/sde_collections/views.py index 5d5d2982..f738b23d 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -34,13 +34,15 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, CuratedUrlAPISerializer, + CuratedURLSerializer, + DeltaURLSerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -285,6 +287,66 @@ def update_division(self, request, pk=None): return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) +class CuratedURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = CuratedUrl.objects.all() + serializer_class = CuratedURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = 
self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + curated_url = get_object_or_404(CuratedUrl, pk=pk) + division = request.data.get("division") + if division: + curated_url.division = division + curated_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + +class DeltaURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = DeltaUrl.objects.all() + serializer_class = DeltaURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + delta_url = get_object_or_404(DeltaUrl, pk=pk) + division = request.data.get("division") + if division: + delta_url.division = division + delta_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + class CandidateURLBulkCreateView(generics.ListCreateAPIView): queryset = CandidateURL.objects.all() serializer_class = CandidateURLBulkCreateSerializer diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index ed6d3e4b..7b01cc6c 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -322,6 +322,436 @@ function initializeDataTable() { }, 1000) ); + var curated_urls_table = $("#curated_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#curatedUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#curatedScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#curatedNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter 
information to the first row + const secondRowFilters = [ + "Export of SDE Curated URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#curated_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_curated_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/curated-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getCuratedURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getCuratedGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "curated_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? 
[getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#curatedUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#curatedScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#curatedNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + + var delta_urls_table = $("#delta_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#deltaUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#deltaScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#deltaNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter information to the first row + const secondRowFilters = [ + "Export of SDE Delta URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#delta_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_delta_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/delta-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? 
false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getDeltaURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getDeltaGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "delta_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? [getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#deltaUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#deltaScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#deltaNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + var exclude_patterns_table = $("#exclude_patterns_table").DataTable({ // scrollY: true, dom: "lBrtip", @@ -682,96 +1112,96 @@ function initializeDataTable() { var division_patterns_table = $("#division_patterns_table").DataTable({ dom: "lBrtip", buttons: [ - { - text: "Add Pattern", - className: "addPattern", - action: function () { - $modal = $("#divisionPatternModal").modal(); - }, + { + text: "Add Pattern", + className: "addPattern", + action: function () { + $modal = $("#divisionPatternModal").modal(); }, - { - text: "Customize Columns", - className: "customizeColumns", - action: function () { - modalContents("#division_patterns_table"); - }, + }, + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#division_patterns_table"); }, + }, ], lengthMenu: [ - [25, 50, 100, 500], - ["Show 25", "Show 50", "Show 100", "Show 500"], + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], ], orderCellsTop: true, pageLength: 100, ajax: `/api/division-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { - this.api() - .columns() - .every(function (index) { - var table = $("#division_patterns_table").DataTable(); - - let 
-            1: {
-              columnToSearch: 6,
-              matchPattern: {
-                "Individual URL Pattern": 1,
-                "Multi-URL Pattern": 2,
-              },
-            },
-            2: {
-              columnToSearch: 7,
-              matchPattern: {
-                "Astrophysics": 1,
-                "Biological and Physical Sciences": 2,
-                "Earth Science": 3,
-                "Heliophysics": 4,
-                "Planetary Science": 5,
-              },
-            },
-          };
-
-          let column = this;
-          if (column.data().length === 0) {
-            $(`#division-patterns-dropdown-${index}`).prop("disabled", true);
-          } else if (index in addDropdownSelect) {
-            $("#division-patterns-dropdown-" + index).on("change", function () {
-              let col = addDropdownSelect[index].columnToSearch;
-              let searchInput =
-                addDropdownSelect[index].matchPattern[$(this).val()];
-              if ($(this).val() === "" || $(this).val() === undefined)
-                table.columns(col).search("").draw();
-              else {
-                table.columns(col).search(searchInput).draw();
-              }
-            });
-          }
-        });
+      this.api()
+        .columns()
+        .every(function (index) {
+          var table = $("#division_patterns_table").DataTable();
+
+          // Each visible dropdown filters a hidden value column: dropdown 1
+          // (match pattern type) searches column 6, dropdown 2 (division)
+          // searches column 7.
+          let addDropdownSelect = {
+            1: {
+              columnToSearch: 6,
+              matchPattern: {
+                "Individual URL Pattern": 1,
+                "Multi-URL Pattern": 2,
+              },
+            },
+            2: {
+              columnToSearch: 7,
+              matchPattern: {
+                "Astrophysics": 1,
+                "Biological and Physical Sciences": 2,
+                "Earth Science": 3,
+                "Heliophysics": 4,
+                "Planetary Science": 5,
+              },
+            },
+          };
+
+          let column = this;
+          if (column.data().length === 0) {
+            $(`#division-patterns-dropdown-${index}`).prop("disabled", true);
+          } else if (index in addDropdownSelect) {
+            $("#division-patterns-dropdown-" + index).on("change", function () {
+              let col = addDropdownSelect[index].columnToSearch;
+              let searchInput =
+                addDropdownSelect[index].matchPattern[$(this).val()];
+              if ($(this).val() === "" || $(this).val() === undefined)
+                table.columns(col).search("").draw();
+              else {
+                table.columns(col).search(searchInput).draw();
+              }
+            });
+          }
+        });
     },
     columns: [
-      { data: "match_pattern", class: "whiteText" },
-      {
-        data: "match_pattern_type_display",
-        class: "text-center whiteText",
-        sortable: false,
-      },
-      { data: "division_display", class: "whiteText" },
-      {
-        data: "candidate_urls_count",
-        class: "text-center whiteText",
-        sortable: true,
-      },
-      {
-        data: null,
-        sortable: false,
-        class: "text-center",
-        render: function (data, type, row) {
-          return `<button class="btn btn-danger delete_division_pattern" data-row-id="${row.id}"><i class="material-icons">delete</i></button>`;
-        },
-      },
-      { data: "id", visible: false, searchable: false },
-      { data: "match_pattern_type", visible: false },
-      { data: "division", visible: false },
+      { data: "match_pattern", class: "whiteText" },
+      {
+        data: "match_pattern_type_display",
+        class: "text-center whiteText",
+        sortable: false,
+      },
+      { data: "division_display", class: "whiteText" },
+      {
+        data: "candidate_urls_count",
+        class: "text-center whiteText",
+        sortable: true,
+      },
+      {
+        data: null,
+        sortable: false,
+        class: "text-center",
+        render: function (data, type, row) {
+          // NOTE: delete-pattern button; this markup is a reconstruction, so
+          // the class names and data attribute are assumed, not original.
+          return `<button class="btn btn-danger delete_division_pattern" data-row-id="${row.id}"><i class="material-icons">delete</i></button>`;
+        },
+      },
+      { data: "id", visible: false, searchable: false },
+      { data: "match_pattern_type", visible: false },
+      { data: "division", visible: false },
     ],
   });

@@ -841,8 +1271,8 @@ function getDivisionColumn() {
        `;
    },

@@ -882,7 +1312,7 @@ $("#division_pattern_form").on("submit", function (e) {
  inputs = {};
  input_serialized = $(this).serializeArray();
  input_serialized.forEach((field) => {
-    inputs[field.name] = field.value;
+    inputs[field.name] = field.value;
  });
  console.log("Form Inputs:", inputs); // Debugging line to check inputs

@@ -902,43 +1332,43 @@ $(".division_form_select").on("click", function (e) {

 function postDivisionPatterns(match_pattern, match_pattern_type, division) {
  if (!match_pattern) {
-      toastr.error("Please highlight a pattern to add division.");
-      return;
+    toastr.error("Please highlight a pattern to add division.");
+    return;
  }
  $.ajax({
-    url: "/api/division-patterns/",
-    type: "POST",
-    data: {
-      collection: collection_id,
-      match_pattern: match_pattern,
-      match_pattern_type: match_pattern_type,
-      division: division,
-      csrfmiddlewaretoken: csrftoken,
-    },
-    success: function (data) {
-      $("#candidate_urls_table").DataTable().ajax.reload(null, false);
-      $("#division_patterns_table").DataTable().ajax.reload(null, false);
-      if (currentTab === "") { // Only add a notification if we are on the first tab
-        newDivisionPatternsCount = newDivisionPatternsCount + 1;
-        $("#divisionPatternsTab").html(
-          `Division Patterns <span class="badge">` +
-          newDivisionPatternsCount + " new" +
-          `</span>`
-        );
-      }
-    },
-    error: function (xhr, status, error) {
-      var errorMessage = xhr.responseText;
-      if (
-        errorMessage ==
-        '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}'
-      ) {
-        toastr.success("Pattern already exists");
-        return;
-      }
-      toastr.error(errorMessage);
-    },
+    url: "/api/division-patterns/",
+    type: "POST",
+    data: {
+      collection: collection_id,
+      match_pattern: match_pattern,
+      match_pattern_type: match_pattern_type,
+      division: division,
+      csrfmiddlewaretoken: csrftoken,
+    },
+    success: function (data) {
+      $("#candidate_urls_table").DataTable().ajax.reload(null, false);
+      $("#division_patterns_table").DataTable().ajax.reload(null, false);
+      if (currentTab === "") { // Only add a notification if we are on the first tab
+        newDivisionPatternsCount = newDivisionPatternsCount + 1;
+        // NOTE: the badge <span> markup is a reconstruction; its classes are
+        // assumed, not original.
+        $("#divisionPatternsTab").html(
+          `Division Patterns <span class="badge">` +
+          newDivisionPatternsCount + " new" +
+          `</span>`
+        );
+      }
+    },
+    error: function (xhr, status, error) {
+      var errorMessage = xhr.responseText;
+      if (
+        errorMessage ==
+        '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}'
+      ) {
+        toastr.success("Pattern already exists");
+        return;
+      }
+      toastr.error(errorMessage);
+    },
  });
 }

@@ -950,9 +1380,36 @@ function getURLColumn() {
  return {
    data: "url",
    width: "30%",
    render: function (data, type, row) {
+      // NOTE: the anchor and icon markup in this template (and in the
+      // curated and delta variants below) is a reconstruction; the tag
+      // attributes are assumed, not original.
      return `<a target="_blank" href="${data}">${remove_protocol(
        data
      )}
-    <i class="material-icons url-icon">open_in_new</i>
-  `;
+    <i class="material-icons url-icon">open_in_new</i>`;
    },
  };
 }
+
+function getCuratedURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `<a target="_blank" href="${data}">${remove_protocol(
+        data
+      )}
+    <i class="material-icons url-icon">open_in_new</i>`;
+    },
+  };
+}
+
+function getDeltaURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `<a target="_blank" href="${data}">${remove_protocol(
+        data
+      )}
+    <i class="material-icons url-icon">open_in_new</i>`;
+    },
+  };
+}

@@ -972,13 +1429,36 @@ function getGeneratedTitleColumn() {
    data: "generated_title",
    width: "20%",
    render: function (data, type, row) {
+      // NOTE: the <input> markup here and in the two variants below is a
+      // reconstruction; the original attributes are assumed, not original.
-      return `<input type="text" class="form-control" value="${data}" />`;
+      return `<input type="text" class="form-control" value="${data}" />`;
    },
  };
 }
+
+function getCuratedGeneratedTitleColumn() {
+  return {
+    data: "generated_title",
+    width: "20%",
+    render: function (data, type, row) {
+      return `<input type="text" class="form-control" value="${data}" />`;
+    },
+  };
+}
+
+function getDeltaGeneratedTitleColumn() {
+  return {
+    data: "generated_title",
+    width: "20%",
+    render: function (data, type, row) {
+      return `<input type="text" class="form-control" value="${data}" />`;
+    },
+  };
+}

@@ -991,11 +1471,11 @@ function getExcludedColumn(true_icon, false_icon) {
    render: function (data, type, row) {
+      // NOTE: the anchor markup is a reconstruction; only the icon variables
+      // and the row["url"] wiring survive, so the attributes are assumed.
      return data === true
-        ? `<a href="#" data-url=${remove_protocol(row["url"])}>${true_icon}</a>`
-        : `<a href="#" data-url=${remove_protocol(row["url"])}>${false_icon}</a>`;
+        ? `<a href="#" data-url=${remove_protocol(
+            row["url"]
+          )}>${true_icon}</a>`
+        : `<a href="#" data-url=${remove_protocol(
+            row["url"]
+          )}>${false_icon}</a>`;
    },
  };
 }

@@ -1016,8 +1496,8 @@ function getDocumentTypeColumn() {
      button_color = data ? "btn-success" : "btn-secondary";
+      // NOTE: the button element is a reconstruction around the surviving
+      // ${button_color} class; its other attributes and contents are assumed.
      return `<button class="btn btn-sm ${button_color}">`;
@@ -323,15 +523,18 @@
@@ -349,15 +552,18 @@
@@ -375,19 +581,23 @@
@@ -405,7 +615,8 @@