From 8baa94fa16c899ed4ac67b40973d85d8b853ae48 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Mon, 26 Aug 2024 14:40:56 -0400 Subject: [PATCH 001/441] Fix fake flake8 issues --- config/settings/production.py | 4 ++-- config_generation/db_to_xml_file_based.py | 2 +- feedback/models.py | 2 +- scraper/url_grouper.py | 6 ++++-- scripts/ej/create_ej_dump.py | 3 ++- scripts/find_redirects_solar_urls.py | 4 ++-- .../quality_and_indexing/restore_deleted_files.py | 3 ++- sde_collections/admin.py | 2 +- sde_collections/models/collection.py | 12 +++++------- sde_collections/models/pattern.py | 2 +- sde_collections/utils/health_check.py | 4 ++-- sde_collections/utils/slack_utils.py | 4 ++-- sde_collections/utils/title_resolver.py | 2 +- 13 files changed, 26 insertions(+), 24 deletions(-) diff --git a/config/settings/production.py b/config/settings/production.py index aff7db28..270aa00c 100644 --- a/config/settings/production.py +++ b/config/settings/production.py @@ -70,11 +70,11 @@ # ------------------------ STATICFILES_STORAGE = "sde_indexing_helper.utils.storages.StaticRootS3Boto3Storage" COLLECTFAST_STRATEGY = "collectfast.strategies.boto3.Boto3Strategy" -STATIC_URL = f"https://{aws_s3_domain}/static/" +STATIC_URL = f"https://{aws_s3_domain}/static/" # noqa: E231 # MEDIA # ------------------------------------------------------------------------------ DEFAULT_FILE_STORAGE = "sde_indexing_helper.utils.storages.MediaRootS3Boto3Storage" -MEDIA_URL = f"https://{aws_s3_domain}/media/" +MEDIA_URL = f"https://{aws_s3_domain}/media/" # noqa: E231 # EMAIL # ------------------------------------------------------------------------------ diff --git a/config_generation/db_to_xml_file_based.py b/config_generation/db_to_xml_file_based.py index 88252366..14b077b7 100644 --- a/config_generation/db_to_xml_file_based.py +++ b/config_generation/db_to_xml_file_based.py @@ -98,7 +98,7 @@ def update_or_add_element_value( parent_element = xml_root if not parent_element_name else xml_root.find(parent_element_name) if parent_element is None: - raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") + raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") # noqa: E713 existing_element = parent_element.find(element_name) if not add_duplicate and existing_element: diff --git a/feedback/models.py b/feedback/models.py index 0666080f..de2921b5 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -33,7 +33,7 @@ def format_notification_message(self): Returns a formatted notification message containing details from this Feedback instance. """ notification_message = ( - f" New Feedback Received : \n" + f" New Feedback Received : \n" # noqa: E203 f"Name: {self.name}\n" f"Email: {self.email}\n" f"Subject: {self.subject}\n" diff --git a/scraper/url_grouper.py b/scraper/url_grouper.py index df3be3f9..db6188e5 100644 --- a/scraper/url_grouper.py +++ b/scraper/url_grouper.py @@ -42,10 +42,12 @@ output_file.write(f"
<a href='{BASE_URL}'>
{BASE_URL}</a>
</h1>
\n") output_file.write("\n") diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py index bab5baac..36d7f722 100644 --- a/scripts/ej/create_ej_dump.py +++ b/scripts/ej/create_ej_dump.py @@ -1,6 +1,7 @@ """ inferences are supplied by the classification model. the contact point is Bishwas -cmr is supplied by running https://github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py +cmr is supplied by running +github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/ """ diff --git a/scripts/find_redirects_solar_urls.py b/scripts/find_redirects_solar_urls.py index 3bdbc131..db78081b 100644 --- a/scripts/find_redirects_solar_urls.py +++ b/scripts/find_redirects_solar_urls.py @@ -43,9 +43,9 @@ def csv_to_dict_list(file_path): scraped_title = soup.find("title").text.strip() if soup.find("title") else "" except (AssertionError, Exception) as parse_error: scraped_title = "" - print(f"Error parsing URL {url_info['url']}: {parse_error}") + print(f"Error parsing URL {url_info['url']}: {parse_error}") # noqa: F821 except requests.RequestException as e: - print(f"Error fetching URL {url_info['url']}: {e}") + print(f"Error fetching URL {url_info['url']}: {e}") # noqa: F821 response_url = "" scraped_title = "" diff --git a/scripts/quality_and_indexing/restore_deleted_files.py b/scripts/quality_and_indexing/restore_deleted_files.py index 6d6fcb84..70721cc9 100644 --- a/scripts/quality_and_indexing/restore_deleted_files.py +++ b/scripts/quality_and_indexing/restore_deleted_files.py @@ -1,5 +1,6 @@ """ -you need to run this script in the root of the repository that from which the file was deleted, in this case the root of the sinequa_configs repository. +you need to run this script in the root of the repository that from which the file was deleted, +in this case the root of the sinequa_configs repository. 
""" import subprocess diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 1b38db21..cb105f80 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -154,7 +154,7 @@ def export_as_csv(self, request, queryset): field_names = [field.name for field in meta.fields] response = HttpResponse(content_type="text/csv") - response["Content-Disposition"] = f"attachment; filename={meta}.csv" + response["Content-Disposition"] = f"attachment; filename={meta}.csv" # noqa: E702 writer = csv.writer(response) writer.writerow(field_names) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index c5690a4b..31306b8c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -4,7 +4,6 @@ import requests from django.contrib.auth import get_user_model from django.db import models -from django.db.models import Q from django.db.models.signals import post_save from django.dispatch import receiver from model_utils import FieldTracker @@ -130,7 +129,7 @@ def tree_root(self) -> str: @property def server_url_secret_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "secret-prod", "scope": "All", @@ -144,7 +143,7 @@ def server_url_secret_prod(self) -> str: @property def server_url_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "query-smd-primary", "scope": "All", @@ -371,13 +370,12 @@ def candidate_urls_count(self) -> int: @property def sinequa_configuration(self) -> str: - return ( - f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" - ) + URL = f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" # noqa: E231, E501 + return URL @property def github_issue_link(self) -> str: - return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" + return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" # noqa: E231 @classmethod def _fetch_json_results(cls, url): diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index ae5d78ef..1e14042b 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -143,7 +143,7 @@ def validate_title_pattern(title_pattern_string): if element_type == "xpath": if not is_valid_xpath(element_value): - raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") + raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231 elif element_type == "brace": try: is_valid_fstring(element_value) diff --git a/sde_collections/utils/health_check.py b/sde_collections/utils/health_check.py index 19c45369..0e09bd87 100644 --- a/sde_collections/utils/health_check.py +++ b/sde_collections/utils/health_check.py @@ -127,12 +127,12 @@ def create_exclude_pattern_report(match_pattern, url): # check with http:// if match_pattern.find("http://") == -1: - url = f"http://{match_pattern}" + url = f"http://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) if match_pattern.find("https://") == -1: - url = f"https://{match_pattern}" + url = f"https://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: 
exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) else: diff --git a/sde_collections/utils/slack_utils.py b/sde_collections/utils/slack_utils.py index c4cfd78b..a8fae3ca 100644 --- a/sde_collections/utils/slack_utils.py +++ b/sde_collections/utils/slack_utils.py @@ -90,7 +90,7 @@ def format_slack_message(name, details, collection_id): message_template = details["message"] tags = " ".join([f"<{user}>" for user in details["tags"]]) - link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" + link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" # noqa: E231 linked_name = f"<{link}|{name}>" return tags + " " + message_template.format(name=linked_name) @@ -101,5 +101,5 @@ def send_slack_message(message): response = requests.post(webhook_url, json=payload) if response.status_code != 200: raise ValueError( - f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" + f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" # noqa: E231, E501 ) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index b9171de3..20211bf7 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -32,7 +32,7 @@ def is_valid_fstring(pattern: str) -> bool: if node.value.id not in context: variables_allowed = ", ".join([key for key in context.keys()]) raise ValueError( - f"Variable '{node.value.id}' not allowed in f-string pattern." + f"Variable '{node.value.id}' not allowed in f-string pattern." # noqa: E713 f" Allowed variables are: {variables_allowed}" ) From 9cc9acd0e957338f8e858885cea484684d75b873 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Mon, 26 Aug 2024 14:43:11 -0400 Subject: [PATCH 002/441] Move noqa to the right line --- scraper/url_grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/url_grouper.py b/scraper/url_grouper.py index db6188e5..01b12f8b 100644 --- a/scraper/url_grouper.py +++ b/scraper/url_grouper.py @@ -46,8 +46,8 @@ output_file.write("\n") output_file.write("\n") output_file.write("\n") From 2a22edab9bed637f81908cedddbc1639fe11851a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:58:03 +0000 Subject: [PATCH 003/441] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.6.0) - [github.com/asottile/pyupgrade: v3.3.1 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.3.1...v3.17.0) - [github.com/psf/black: 23.1.0 → 24.8.0](https://github.com/psf/black/compare/23.1.0...24.8.0) - [github.com/PyCQA/isort: 5.12.0 → 5.13.2](https://github.com/PyCQA/isort/compare/5.12.0...5.13.2) - [github.com/PyCQA/flake8: 6.0.0 → 7.1.1](https://github.com/PyCQA/flake8/compare/6.0.0...7.1.1) - [github.com/pre-commit/mirrors-mypy: v1.4.0 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.4.0...v1.11.2) --- .pre-commit-config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c4d553f..5631a71d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,37 +3,37 @@ default_stages: [commit] repos: - repo: 
https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.17.0 hooks: - id: pyupgrade args: [--py310-plus] - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: ["--config=setup.cfg"] additional_dependencies: [flake8-isort] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.0 + rev: v1.11.2 hooks: - id: mypy args: ["--strict"] From bccdaec949f4f6ed069010b1a3dfb7e798a57512 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:58:48 +0000 Subject: [PATCH 004/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config/wsgi.py | 1 + config_generation/delete_config_folders.py | 1 + config_generation/generate_collection_list.py | 1 + config_generation/generate_commands.py | 1 + document_classifier/async_scraper.py | 1 + document_classifier/encoder.py | 1 + document_classifier/load_dataset.py | 1 + sde_indexing_helper/users/tests/test_forms.py | 1 + 8 files changed, 8 insertions(+) diff --git a/config/wsgi.py b/config/wsgi.py index bc448e89..bbc3c1ef 100644 --- a/config/wsgi.py +++ b/config/wsgi.py @@ -13,6 +13,7 @@ framework. """ + import os import sys from pathlib import Path diff --git a/config_generation/delete_config_folders.py b/config_generation/delete_config_folders.py index 0fc138d6..119d48fc 100644 --- a/config_generation/delete_config_folders.py +++ b/config_generation/delete_config_folders.py @@ -5,6 +5,7 @@ - commands - jobs """ + import glob import os import shutil diff --git a/config_generation/generate_collection_list.py b/config_generation/generate_collection_list.py index 86556c53..ee0e9b47 100644 --- a/config_generation/generate_collection_list.py +++ b/config_generation/generate_collection_list.py @@ -4,6 +4,7 @@ - filter anything that isn't a webcrawler - provide a variable, turned_on_remaining_webcrawlers for import by other files """ + import os from db_to_xml import XmlEditor diff --git a/config_generation/generate_commands.py b/config_generation/generate_commands.py index a538ee03..1b41858c 100644 --- a/config_generation/generate_commands.py +++ b/config_generation/generate_commands.py @@ -2,6 +2,7 @@ sometimes spot fixes need to be run on a list of collections this file provides a quick framework to generate a batch of commands based on an input json """ + from db_to_xml_file_based import XmlEditor from generate_jobs import ParallelJobCreator diff --git a/document_classifier/async_scraper.py b/document_classifier/async_scraper.py index fb2fb7c7..12c039a3 100644 --- a/document_classifier/async_scraper.py +++ b/document_classifier/async_scraper.py @@ -1,4 +1,5 @@ """Asynchronously scrapes the HTML content of a given URL using a headless browser.""" + import asyncio import re diff --git a/document_classifier/encoder.py b/document_classifier/encoder.py index c62bfafc..1bacc5d9 100644 --- a/document_classifier/encoder.py +++ b/document_classifier/encoder.py @@ -1,4 +1,5 @@ """ Encoding the url response """ + import pandas as pd diff --git a/document_classifier/load_dataset.py b/document_classifier/load_dataset.py index 8c64e03b..d61efdad 100644 --- 
a/document_classifier/load_dataset.py +++ b/document_classifier/load_dataset.py @@ -1,4 +1,5 @@ """ Module for loading dataset """ + from torch.utils.data import DataLoader, SequentialSampler, TensorDataset diff --git a/sde_indexing_helper/users/tests/test_forms.py b/sde_indexing_helper/users/tests/test_forms.py index f89d7141..68145eaa 100644 --- a/sde_indexing_helper/users/tests/test_forms.py +++ b/sde_indexing_helper/users/tests/test_forms.py @@ -1,6 +1,7 @@ """ Module for all Form Tests. """ + from django.utils.translation import gettext_lazy as _ from sde_indexing_helper.users.forms import UserAdminCreationForm From abd565a01c1f2e3ddcfa0aa6a60595b4cda326b9 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 26 Aug 2024 21:15:21 -0500 Subject: [PATCH 005/441] Add LRM_QA_{USER, PASSWORD} variable to .django --- .envs/.local/.django | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.envs/.local/.django b/.envs/.local/.django index 97dfaab8..291b9a32 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -37,3 +37,5 @@ LRM_USER='' LRM_PASSWORD='' XLI_USER='' XLI_PASSWORD='' +LRM_QA_USER = '' +LRM_QA_PASSWORD = '' From d656506fb5fee5a11433141579b402f3c6ad06f1 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Wed, 28 Aug 2024 17:02:20 -0500 Subject: [PATCH 006/441] Fixes issue #989 - Make coding syntax consistent --- .envs/.local/.django | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index 291b9a32..402efc3c 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -37,5 +37,5 @@ LRM_USER='' LRM_PASSWORD='' XLI_USER='' XLI_PASSWORD='' -LRM_QA_USER = '' -LRM_QA_PASSWORD = '' +LRM_QA_USER='' +LRM_QA_PASSWORD='' From 71c8fb4485fbbf9c9a7a767c7d15c491561eeff2 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 15:02:17 -0500 Subject: [PATCH 007/441] Fixes issue #993 - Add SQLDumpRestoration.md file --- SQLDumpRestoration.md | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 SQLDumpRestoration.md diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md new file mode 100644 index 00000000..3a672a1f --- /dev/null +++ b/SQLDumpRestoration.md @@ -0,0 +1,81 @@ +## Restoring the Database from SQL Dump + +We generally load a database backup from a JSON file by using the following command. + +``` +docker-compose -f local.yml run --rm django python manage.py loaddata backup.json +``` + +However, if the JSON file is particularly large (>1.5GB), Docker might struggle with this method. In such cases, you can use SQL dump and restore commands as an alternative. + +### Steps for Using SQL Dump and Restore + +1. Begin by starting only the PostgreSQL container. This prevents the Django container from making changes while the PostgreSQL container is starting up. + +``` +docker-compose -f local.yml up postgres +``` + +2. Find the container ID using `docker ps`, then enter the PostgreSQL container to execute commands. + +``` +$ docker ps +CONTAINER ID IMAGE COMMAND +23d33f22cc43 sde_indexing_helper_production_postgres "docker-entrypoint.s…" + +$ docker exec -it 23d33f22cc43 bash +``` + +3. Create a connection to the database. + +``` +psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper +``` + +4. Ensure that the database `sde_indexing_helper` is empty. + +``` +sde_indexing_helper-# \c +You are now connected to database "sde_indexing_helper" as user "VnUvMKBSdk...". +sde_indexing_helper-# \dt +Did not find any relations. 
+``` + +If the database is not empty, delete its contents to create a fresh database: + +``` +sde_indexing_helper=# \c postgres +You are now connected to database "postgres" as user "VnUvMKBSdkoFIETgLongnxYHrYVJKufn". +postgres=# DROP DATABASE sde_indexing_helper; +DROP DATABASE +postgres=# CREATE DATABASE sde_indexing_helper; +CREATE DATABASE + +``` + +5. Transfer the backup SQL dump (`backup.sql`) from your local machine to the PostgreSQL container. + +``` +docker cp /local/path/backup.sql 23d33f22cc43:/ +``` + +6. Import the SQL dump into the PostgreSQL container. + +``` +psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper -f backup.sql +``` + +**Note**: To create a SQL dump of your PostgreSQL database, use the following command: + +``` +pg_dump -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -W -F p -f backup.sql sde_indexing_helper +``` + +7. Bring up all containers at once, and create a superuser account for logging in. + +``` +docker-compose -f local.yml up +docker-compose -f local.yml run --rm django python manage.py createsuperuser +``` + +8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. \ No newline at end of file From 52521e3076eab9e6a3b265a142238a9b4f281f89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:04:58 +0000 Subject: [PATCH 008/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- SQLDumpRestoration.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md index 3a672a1f..49e78994 100644 --- a/SQLDumpRestoration.md +++ b/SQLDumpRestoration.md @@ -19,11 +19,11 @@ docker-compose -f local.yml up postgres 2. Find the container ID using `docker ps`, then enter the PostgreSQL container to execute commands. ``` -$ docker ps -CONTAINER ID IMAGE COMMAND +$ docker ps +CONTAINER ID IMAGE COMMAND 23d33f22cc43 sde_indexing_helper_production_postgres "docker-entrypoint.s…" -$ docker exec -it 23d33f22cc43 bash +$ docker exec -it 23d33f22cc43 bash ``` 3. Create a connection to the database. @@ -32,7 +32,7 @@ $ docker exec -it 23d33f22cc43 bash psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper ``` -4. Ensure that the database `sde_indexing_helper` is empty. +4. Ensure that the database `sde_indexing_helper` is empty. ``` sde_indexing_helper-# \c @@ -78,4 +78,4 @@ docker-compose -f local.yml up docker-compose -f local.yml run --rm django python manage.py createsuperuser ``` -8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. \ No newline at end of file +8. Log in to the SDE Indexing Helper frontend to ensure that all data has been correctly populated in the UI. From 2e147f9e2d24d1b2ae20afcba3afc68ec5cd1d57 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 16:53:13 -0500 Subject: [PATCH 009/441] Fixes issue #995 --- CONTRIBUTING.md | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..db454288 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +Thank you for your interest in contributing to COSMOS! We welcome contributions and appreciate your help in making this project better. Please follow the guidelines below to ensure a smooth contribution process. 
+ +## Pull Requests + +### Prerequisites + +- **GitHub CLI (`gh`)**: Make sure you have the GitHub CLI installed. If not, you can install it from [GitHub CLI installation page](https://cli.github.com/). + +### 1. **Create an Issue on the Repo** + +1. **Navigate to Your Repository**: + + ```bash + $ cd path/to/your/repository + ``` + +2. **Create an Issue**: +Use the `gh issue create` command to create a new issue. + + ```bash + $ gh issue create --title "Issue Title" --body "Description of the issue" + ``` + + After running this command, you’ll get an issue number in the output. Note this number as it will be used to create a branch. + + +### 2. **Create a Branch for the Issue** + +1. **Create a Branch**: +Use the `gh` CLI to create a branch associated with the issue. The `gh` CLI can automatically create a branch for you based on the issue number. In this case, the `` is 989. + + ```bash + $ gh issue develop -c 989 + github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent + From https://github.com/NASA-IMPACT/COSMOS + * [new branch] 989-make-coding-syntax-consistent -> origin/989-make-coding-syntax-consistent + + ``` + + This command creates a new branch named `-issue` and switches to it. This branch will be used to work on the issue. + +2. **Make Your Changes and Push:** +Edit files, add code, or make any changes needed to address the issue. Commit your changes and push the branch to the remote repository. + + ```bash + git add . + git commit -m "Fixes issue #" + git push origin -issue + ``` + + +### 3. **Create a Pull Request** + +1. **Create the Pull Request**: +After pushing the branch, create a pull request using the `gh pr create` command: + + ```bash + gh pr create --base dev --head -issue --title "Title of the Pull Request" --body "Description of the changes" + ``` + + - **`-base`**: The base branch you want to merge your changes into (`dev` in our case) + - **`-head`**: The branch that contains your changes (e.g., `-issue`). + - **`-title`**: The title of the pull request. + - **`-body`**: The description or body of the pull request. + + This command will create a pull request from your branch into the base branch specified. + +2. **Review and Merge**: +Once the pull request is created, you can review it on GitHub and merge it if everything looks good. \ No newline at end of file From 733c6c55d40c8e144aeea3b1e91462f857416d82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:57:55 +0000 Subject: [PATCH 010/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CONTRIBUTING.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index db454288..7564194d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,61 +9,61 @@ Thank you for your interest in contributing to COSMOS! We welcome contributions ### 1. **Create an Issue on the Repo** 1. **Navigate to Your Repository**: - + ```bash $ cd path/to/your/repository ``` - + 2. **Create an Issue**: Use the `gh issue create` command to create a new issue. - + ```bash $ gh issue create --title "Issue Title" --body "Description of the issue" ``` - + After running this command, you’ll get an issue number in the output. Note this number as it will be used to create a branch. - + ### 2. **Create a Branch for the Issue** 1. **Create a Branch**: Use the `gh` CLI to create a branch associated with the issue. 
The `gh` CLI can automatically create a branch for you based on the issue number. In this case, the `` is 989. - + ```bash $ gh issue develop -c 989 - github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent - From https://github.com/NASA-IMPACT/COSMOS + github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent + From https://github.com/NASA-IMPACT/COSMOS * [new branch] 989-make-coding-syntax-consistent -> origin/989-make-coding-syntax-consistent - + ``` - + This command creates a new branch named `-issue` and switches to it. This branch will be used to work on the issue. - + 2. **Make Your Changes and Push:** Edit files, add code, or make any changes needed to address the issue. Commit your changes and push the branch to the remote repository. - + ```bash git add . git commit -m "Fixes issue #" git push origin -issue ``` - + ### 3. **Create a Pull Request** 1. **Create the Pull Request**: After pushing the branch, create a pull request using the `gh pr create` command: - + ```bash gh pr create --base dev --head -issue --title "Title of the Pull Request" --body "Description of the changes" ``` - + - **`-base`**: The base branch you want to merge your changes into (`dev` in our case) - **`-head`**: The branch that contains your changes (e.g., `-issue`). - **`-title`**: The title of the pull request. - **`-body`**: The description or body of the pull request. - + This command will create a pull request from your branch into the base branch specified. - + 2. **Review and Merge**: -Once the pull request is created, you can review it on GitHub and merge it if everything looks good. \ No newline at end of file +Once the pull request is created, you can review it on GitHub and merge it if everything looks good. From 3cfa766d2c531a131d62cb889820261f0542a987 Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 20:18:05 -0500 Subject: [PATCH 011/441] Add more description to 'Review and Merge' --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7564194d..b3d25f62 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,4 +66,4 @@ After pushing the branch, create a pull request using the `gh pr create` command This command will create a pull request from your branch into the base branch specified. 2. **Review and Merge**: -Once the pull request is created, you can review it on GitHub and merge it if everything looks good. +Once the pull request is created, we will review it on GitHub and merge it if everything looks good. If any changes are required, we might ask you to make adjustments before the merge. From a7f823a03996d7b119aeb07beca6725c4005851d Mon Sep 17 00:00:00 2001 From: Dawadi Kiran Date: Mon, 2 Sep 2024 21:34:39 -0500 Subject: [PATCH 012/441] Improve SQLDumpRestoration.md and README.md files --- README.md | 7 +++++-- SQLDumpRestoration.md | 16 ++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 61cf6b50..cc64ef01 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,10 @@ $ docker cp /path/to/your/backup.json container_name:/path/inside/container/back $ docker-compose -f local.yml run --rm django python manage.py loaddata /path/inside/the/container/backup.json $ docker-compose -f local.yml run --rm django python manage.py migrate ``` +### Restoring the Database from a SQL Dump +If the JSON file is particularly large (>1.5GB), Docker might struggle with this method. 
In such cases, you can use SQL dump and restore commands as an alternative, as described [here](./SQLDumpRestoration.md). + + ## Additional Commands @@ -191,8 +195,7 @@ Documented [here](https://github.com/NASA-IMPACT/sde-indexing-helper/wiki/How-to ## Adding New Features/Fixes -1. Start with a [GitHub issue](https://github.com/NASA-IMPACT/sde-indexing-helper/issues). -2. Use the GitHub CLI to create branches and pull requests (`gh issue develop -c `). +We welcome contributions to improve the project! Before you begin, please take a moment to review our [Contributing Guidelines](./CONTRIBUTING.md). These guidelines will help you understand the process for submitting new features, bug fixes, and other improvements. ## Job Creation diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md index 49e78994..6b4792be 100644 --- a/SQLDumpRestoration.md +++ b/SQLDumpRestoration.md @@ -29,10 +29,14 @@ $ docker exec -it 23d33f22cc43 bash 3. Create a connection to the database. ``` -psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper +psql -U -d ``` -4. Ensure that the database `sde_indexing_helper` is empty. +**Note**: +- For local deployment, refer to the `.envs/.local/.postgres` file for the `POSTGRES_USER` and `POSTGRES_DB` variables. +- For production deployment, refer to the `.envs/.production/.postgres` file. + +4. Ensure that the database `` is empty. Here's an example: ``` sde_indexing_helper-# \c @@ -44,8 +48,8 @@ Did not find any relations. If the database is not empty, delete its contents to create a fresh database: ``` -sde_indexing_helper=# \c postgres -You are now connected to database "postgres" as user "VnUvMKBSdkoFIETgLongnxYHrYVJKufn". +sde_indexing_helper=# \c postgres //connect to a different database before dropping +You are now connected to database "postgres" as user "VnUvMKBSdk....". postgres=# DROP DATABASE sde_indexing_helper; DROP DATABASE postgres=# CREATE DATABASE sde_indexing_helper; @@ -62,13 +66,13 @@ docker cp /local/path/backup.sql 23d33f22cc43:/ 6. Import the SQL dump into the PostgreSQL container. ``` -psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper -f backup.sql +psql -U -d -f backup.sql ``` **Note**: To create a SQL dump of your PostgreSQL database, use the following command: ``` -pg_dump -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -W -F p -f backup.sql sde_indexing_helper +pg_dump -U -W -F p -f backup.sql ``` 7. Bring up all containers at once, and create a superuser account for logging in. 
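PATCH 012 documents the dump/restore flow as manual `docker`/`psql` steps. For repeated restores the same flow can be scripted; below is a minimal sketch, not part of any patch in this series. The container name, user, and database are illustrative placeholders — in practice they come from `docker ps` and the `POSTGRES_USER`/`POSTGRES_DB` values in `.envs/.local/.postgres`.

```python
"""Hedged sketch: automate steps 5-6 of SQLDumpRestoration.md."""
import subprocess

CONTAINER = "sde_indexing_helper_postgres"  # placeholder; find the real name via `docker ps`
DB_USER = "postgres"                        # placeholder; use POSTGRES_USER from .envs/.local/.postgres
DB_NAME = "sde_indexing_helper"             # placeholder; use POSTGRES_DB from .envs/.local/.postgres


def restore_sql_dump(dump_path: str) -> None:
    """Copy a SQL dump into the running postgres container and replay it."""
    # Step 5: copy the dump file into the container's filesystem
    subprocess.run(["docker", "cp", dump_path, f"{CONTAINER}:/backup.sql"], check=True)
    # Step 6: import the dump with psql inside the container
    subprocess.run(
        ["docker", "exec", CONTAINER, "psql", "-U", DB_USER, "-d", DB_NAME, "-f", "/backup.sql"],
        check=True,
    )


if __name__ == "__main__":
    restore_sql_dump("backup.sql")
```

As with the manual steps, this assumes only the postgres container is up (`docker-compose -f local.yml up postgres`) and that the target database already exists and is empty.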
From 40574b9b8ad9e86295e6b6fdc1aea705744a63c3 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 9 Sep 2024 09:20:18 -0500 Subject: [PATCH 013/441] remove force reindexing from templates --- config_generation/xmls/job_template.xml | 4 ++-- config_generation/xmls/plugin_indexing_template.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config_generation/xmls/job_template.xml b/config_generation/xmls/job_template.xml index c5406ea9..9832101e 100644 --- a/config_generation/xmls/job_template.xml +++ b/config_generation/xmls/job_template.xml @@ -3,7 +3,7 @@ collection - _ForceReindexation + false @@ -32,4 +32,4 @@ - + \ No newline at end of file diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index b7a9ce63..f7978062 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -8,7 +8,7 @@ 1 - true + false SMD_Plugins/Sinequa.Plugin.WebCrawler_Index_URLList 6 @@ -272,4 +272,4 @@ version Md5(doc.url1) - + \ No newline at end of file From a4fb1586af159a9455cdb0126043aae479e7e1b3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:23:31 +0000 Subject: [PATCH 014/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config_generation/xmls/job_template.xml | 2 +- config_generation/xmls/plugin_indexing_template.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config_generation/xmls/job_template.xml b/config_generation/xmls/job_template.xml index 9832101e..7763ecf1 100644 --- a/config_generation/xmls/job_template.xml +++ b/config_generation/xmls/job_template.xml @@ -32,4 +32,4 @@ - \ No newline at end of file + diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index f7978062..44bfba6c 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -272,4 +272,4 @@ version Md5(doc.url1) - \ No newline at end of file + From 0bb97e3c01dd34142c32aa5c2d623da113fa207a Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 16 Sep 2024 16:12:59 -0500 Subject: [PATCH 015/441] point tree root to name --- sde_collections/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 4540bdfb..9623e85d 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -141,9 +141,9 @@ def get_file_extension(self, obj): def get_tree_root(self, obj): if obj.collection.is_multi_division: if obj.division: - return f"/{obj.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.get_division_display()}/{obj.collection.name}/" else: - return f"/{obj.collection.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.collection.get_division_display()}/{obj.collection.name}/" else: return obj.collection.tree_root From 233730b19c3a7984e56160f58fb3f35d5da74123 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 23 Sep 2024 13:35:20 -0500 Subject: [PATCH 016/441] change LRM dev configurations --- sde_collections/sinequa_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 1dffe26b..1c0a663f 100644 --- a/sde_collections/sinequa_api.py 
+++ b/sde_collections/sinequa_api.py @@ -38,13 +38,13 @@ "base_url": "http://sde-xli.nasa-impact.net", }, "lrm_dev_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-lrm.nasa-impact.net", }, "lrm_qa_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-qa.nasa-impact.net", }, } From 9408e45fc8e565efeeac25d2f1e43abf90f27526 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 25 Sep 2024 13:44:13 -0500 Subject: [PATCH 017/441] get URLs from scrapers folder for LRM servers --- sde_collections/sinequa_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 1c0a663f..0e4c3b62 100644 --- a/sde_collections/sinequa_api.py +++ b/sde_collections/sinequa_api.py @@ -94,7 +94,7 @@ def query(self, page: int, collection_config_folder: str = "") -> Any: } if collection_config_folder: - if self.server_name == "lis_server": + if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]: payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/" else: payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/" From 9c7b25bc0f43234fa9ac097f2a53a9ef02aae131 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:04:46 -0500 Subject: [PATCH 018/441] adding the new base URL model --- sde_collections/models/url.py | 85 +++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 sde_collections/models/url.py diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py new file mode 100644 index 00000000..7ce86dff --- /dev/null +++ b/sde_collections/models/url.py @@ -0,0 +1,85 @@ +import os +from urllib.parse import urlparse + +from django.db import models + +from .collection import Collection +from .collection_choice_fields import Divisions, DocumentTypes +from .pattern import ExcludePattern + + +class UrlQuerySet(models.QuerySet): + def with_exclusion_status(self): + return self.annotate( + excluded=models.Exists( + ExcludePattern.candidate_urls.through.objects.filter(candidateurl=models.OuterRef("pk")) + ) + ) + + +class UrlManager(models.Manager): + def get_queryset(self): + return UrlQuerySet(self.model, using=self._db).with_exclusion_status() + + +class Url(models.Model): + """This is the base URL model which serves as a base for DeltaUrl and CuratedUrl.""" + + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="urls") + url = models.CharField("URL", max_length=4096) + scraped_title = models.CharField( + "Scraped Title", + max_length=1024, + default="", + blank=True, + help_text="This is the original title scraped by Sinequa", + ) + generated_title = models.CharField( + "Generated Title", + max_length=1024, + default="", + blank=True, + help_text="This is the title generated based on a Title Pattern", + ) + visited = models.BooleanField(default=False) + document_type = models.IntegerField(choices=DocumentTypes.choices, null=True) + division = models.IntegerField(choices=Divisions.choices, null=True) + + objects = UrlManager() + + class Meta: + verbose_name = "URL" + verbose_name_plural = "URLs" + ordering = ["url"] + + @property + def fileext(self) -> str: + parsed_url = urlparse(self.url) + path = parsed_url.path + if path.endswith("/") 
or not path: + return "html" + extension = os.path.splitext(path)[1] + return extension[1:] if extension.startswith(".") else extension or "html" + + def splits(self) -> list[tuple[str, str]]: + parts = [] + part_string = "" + for part in self.path.split("/"): + if part: + part_string += f"/{part}" + parts.append((part_string, part)) + return parts + + @property + def path(self) -> str: + parsed = urlparse(self.url) + path = f"{parsed.path}" + if parsed.query: + path += f"?{parsed.query}" + return path + + def __str__(self) -> str: + return self.url + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) From 115481d5359ff7064061ea18ff0e02152343fbd7 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:08 -0500 Subject: [PATCH 019/441] adding the new dump url model --- sde_collections/models/dump_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py new file mode 100644 index 00000000..85ef85d9 --- /dev/null +++ b/sde_collections/models/dump_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class DumpUrl(Url): + """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" From 8af6102de71cb45d5e32f0c61dedf011583df1d0 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:28 -0500 Subject: [PATCH 020/441] adding the new delta url model --- sde_collections/models/delta_url.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 sde_collections/models/delta_url.py diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py new file mode 100644 index 00000000..028607ab --- /dev/null +++ b/sde_collections/models/delta_url.py @@ -0,0 +1,13 @@ +from django.db import models + +from .url import Url + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + verbose_name_plural = "Delta URLs" From 3f9c88520939f53a61711f6c2dc6f0ec351c6918 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:50 -0500 Subject: [PATCH 021/441] adding the new curated url model --- sde_collections/models/curated_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/curated_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py new file mode 100644 index 00000000..d55dcb5f --- /dev/null +++ b/sde_collections/models/curated_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 3c9627fc3e67d477f2746d63a8304695b334ed5e Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:06:08 -0500 Subject: [PATCH 022/441] adding the necessary migration file --- .../0059_url_curatedurl_deltaurl_dumpurl.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py new file mode 100644 index 00000000..82f4d4af --- /dev/null +++ 
b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -0,0 +1,146 @@ +# Generated by Django 4.2.9 on 2024-10-10 03:01 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="Url", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(max_length=4096, verbose_name="URL")), + ( + "scraped_title", + models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + max_length=1024, + verbose_name="Scraped Title", + ), + ), + ( + "generated_title", + models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + max_length=1024, + verbose_name="Generated Title", + ), + ), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "verbose_name": "URL", + "verbose_name_plural": "URLs", + "ordering": ["url"], + }, + ), + migrations.CreateModel( + name="CuratedUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Curated URL", + "verbose_name_plural": "Curated URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DeltaUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ("delete", models.BooleanField(default=False)), + ], + options={ + "verbose_name": "Delta URL", + "verbose_name_plural": "Delta URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 2fcd346a2260779f64f319f7c63436792ca13cc1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:08:41 -0500 Subject: [PATCH 023/441] adding a command file to migrate urls into delta and curated URL models --- .../management/commands/migrate_urls.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 sde_collections/management/commands/migrate_urls.py diff --git a/sde_collections/management/commands/migrate_urls.py b/sde_collections/management/commands/migrate_urls.py new file mode 100644 index 
00000000..6958c107 --- /dev/null +++ b/sde_collections/management/commands/migrate_urls.py @@ -0,0 +1,59 @@ +from django.core.management.base import BaseCommand + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices +from sde_collections.models.curated_url import CuratedUrl +from sde_collections.models.delta_url import DeltaUrl + + +class Command(BaseCommand): + help = "Migrate CandidateURLs to CuratedUrl or DeltaUrl based on collection workflow status" + + def handle(self, *args, **kwargs): + # Migrate CandidateURLs for collections with CURATED or higher workflow status to CuratedUrl + collections_for_curated = Collection.objects.filter(workflow_status__gte=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_curated.count()} collections with CURATED or higher status..." + ) + + for collection in collections_for_curated: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + CuratedUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to CuratedUrl." + ) + + # Migrate CandidateURLs for collections with a status lower than CURATED to DeltaUrl + collections_for_delta = Collection.objects.filter(workflow_status__lt=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_delta.count()} collections with status lower than CURATED..." + ) + + for collection in collections_for_delta: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + DeltaUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + delete=False, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to DeltaUrl." 
+ ) + + self.stdout.write(self.style.SUCCESS("Migration complete.")) From d691af30fa10362f35a61b6bfd9f175ba3175bac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:09:37 -0500 Subject: [PATCH 024/441] added the new models into admin console --- sde_collections/admin.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cb105f80..e4ff5097 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,7 +5,11 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -299,9 +303,41 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class CuratedUrlAdmin(admin.ModelAdmin): + """Admin View for CuratedUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DeltaUrlAdmin(admin.ModelAdmin): + """Admin View for DeltaUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) +admin.site.register(Url, UrlAdmin) +admin.site.register(DeltaUrl, DeltaUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(CuratedUrl, CuratedUrlAdmin) From a17029f88dc2644e3705ed11aa9ce9a4e727c431 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:01:11 -0500 Subject: [PATCH 025/441] removed url and dumpurl models from admin --- sde_collections/admin.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index e4ff5097..4fce1ea7 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,9 +7,7 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -303,20 +301,6 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") -class UrlAdmin(admin.ModelAdmin): - """Admin View for Url""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - -class DumpUrlAdmin(admin.ModelAdmin): - """Admin View for DumpUrl""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - class CuratedUrlAdmin(admin.ModelAdmin): """Admin View for CuratedUrl""" @@ -337,7 +321,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): 
admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) -admin.site.register(Url, UrlAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) -admin.site.register(DumpUrl, DumpUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) From 8606581c8e7970403519499e7171ae8503f7c296 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:10 -0500 Subject: [PATCH 026/441] edited the curated url api serialzier used for indexing --- sde_collections/serializers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..2f11700b 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,6 +3,7 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -107,19 +108,18 @@ class Meta: ) -class CandidateURLAPISerializer(serializers.ModelSerializer): +class CuratedUrlAPISerializer(serializers.ModelSerializer): document_type = serializers.SerializerMethodField() title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() class Meta: - model = CandidateURL + model = CuratedUrl fields = ( "url", "title", "document_type", - "hash", "file_extension", "tree_root", ) From 0f8578cb2059bfce3c9f0508663090fd7e6c08ff Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:35 -0500 Subject: [PATCH 027/441] changed the api endpoit to have an appropriate name --- sde_collections/urls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 4e3d0534..214d1198 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -55,9 +55,9 @@ # Delete an existing CandidateURL instance: /candidate-urls/{id}/ path("api/", include(router.urls)), path( - "candidate-urls-api//", - view=views.CandidateURLAPIView.as_view(), - name="candidate-url-api", + "curated-urls-api//", + view=views.CuratedURLAPIView.as_view(), + name="curated-url-api", ), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), ] From 717eb533f59878f776b45b43216464323127341f Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:03:22 -0500 Subject: [PATCH 028/441] changed the api vew to point to the right curated url model --- sde_collections/views.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index 241979ba..b8ff70a0 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,6 +27,7 @@ DocumentTypes, WorkflowStatusChoices, ) +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,11 +36,11 @@ TitlePattern, ) from .serializers import ( - CandidateURLAPISerializer, CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, + CuratedUrlAPISerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -307,8 +308,8 @@ def create(self, request, *args, **kwargs): return Response(serializer.data, 
status=status.HTTP_201_CREATED)


-class CandidateURLAPIView(ListAPIView):
-    serializer_class = CandidateURLAPISerializer
+class CuratedURLAPIView(ListAPIView):
+    serializer_class = CuratedUrlAPISerializer

     def get(self, request, *args, **kwargs):
         config_folder = kwargs.get("config_folder")
@@ -317,7 +318,7 @@ def get(self, request, *args, **kwargs):

     def get_queryset(self):
         queryset = (
-            CandidateURL.objects.filter(collection__config_folder=self.config_folder)
+            CuratedUrl.objects.filter(collection__config_folder=self.config_folder)
             .with_exclusion_status()
             .filter(excluded=False)
         )

From 83cb35a45d39dba10fcc22e0d7b6ae7979cc299b Mon Sep 17 00:00:00 2001
From: Bishwas Praveen
Date: Mon, 14 Oct 2024 12:03:36 -0500
Subject: [PATCH 029/441] migration file with changes

---
 .../migrations/0060_delete_dumpurl.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 sde_collections/migrations/0060_delete_dumpurl.py

diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py
new file mode 100644
index 00000000..db9a10c1
--- /dev/null
+++ b/sde_collections/migrations/0060_delete_dumpurl.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.9 on 2024-10-14 16:37
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name="DumpUrl",
+        ),
+    ]

From 27d0b49bff19ce81905286f3b3cb2925132dcca0 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Mon, 14 Oct 2024 20:40:01 -0500
Subject: [PATCH 030/441] change EnableNeuralIndexing to true in indexing template

---
 config_generation/xmls/plugin_indexing_template.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml
index 44bfba6c..34aea51f 100644
--- a/config_generation/xmls/plugin_indexing_template.xml
+++ b/config_generation/xmls/plugin_indexing_template.xml
@@ -20,7 +20,7 @@

-      <EnableNeuralIndexing>false</EnableNeuralIndexing>
+      <EnableNeuralIndexing>true</EnableNeuralIndexing>
       true
       false

From d537302dcbbc288175ec81f62994b5fec84fbcbc Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Thu, 17 Oct 2024 13:52:37 -0500
Subject: [PATCH 031/441] add per indicator thresholding and new dump

---
 scripts/ej/cmr_to_models.py  |  2 +-
 scripts/ej/create_ej_dump.py | 37 +++++++++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py
index 130de722..f7ba46db 100644
--- a/scripts/ej/cmr_to_models.py
+++ b/scripts/ej/cmr_to_models.py
@@ -69,7 +69,7 @@ def categorize_processing_level(level):

 # remove existing data
 EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete()
-ej_dump = json.load(open("backups/ej_dump_20240815_112916.json"))
+ej_dump = json.load(open("backups/ej_dump_20241017_133151.json"))
 for dataset in ej_dump:
     ej_row = EnvironmentalJusticeRow(
         destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV,

diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py
index 36d7f722..c44aebc5 100644
--- a/scripts/ej/create_ej_dump.py
+++ b/scripts/ej/create_ej_dump.py
@@ -2,7 +2,7 @@
 inferences are supplied by the classification model.

From 83cb35a45d39dba10fcc22e0d7b6ae7979cc299b Mon Sep 17 00:00:00 2001
From: Bishwas Praveen
Date: Mon, 14 Oct 2024 12:03:36 -0500
Subject: [PATCH 029/441] migration file with changes

---
 .../migrations/0060_delete_dumpurl.py         | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 sde_collections/migrations/0060_delete_dumpurl.py

diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py
new file mode 100644
index 00000000..db9a10c1
--- /dev/null
+++ b/sde_collections/migrations/0060_delete_dumpurl.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.9 on 2024-10-14 16:37
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name="DumpUrl",
+        ),
+    ]

From 27d0b49bff19ce81905286f3b3cb2925132dcca0 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Mon, 14 Oct 2024 20:40:01 -0500
Subject: [PATCH 030/441] change EnableNeuralIndexing to true in indexing
 template

---
 config_generation/xmls/plugin_indexing_template.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml
index 44bfba6c..34aea51f 100644
--- a/config_generation/xmls/plugin_indexing_template.xml
+++ b/config_generation/xmls/plugin_indexing_template.xml
@@ -20,7 +20,7 @@

-    <EnableNeuralIndexing>false</EnableNeuralIndexing>
+    <EnableNeuralIndexing>true</EnableNeuralIndexing>
     true
     false

From d537302dcbbc288175ec81f62994b5fec84fbcbc Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Thu, 17 Oct 2024 13:52:37 -0500
Subject: [PATCH 031/441] add per-indicator thresholding and new dump

---
 scripts/ej/cmr_to_models.py  |  2 +-
 scripts/ej/create_ej_dump.py | 37 +++++++++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py
index 130de722..f7ba46db 100644
--- a/scripts/ej/cmr_to_models.py
+++ b/scripts/ej/cmr_to_models.py
@@ -69,7 +69,7 @@ def categorize_processing_level(level):

 # remove existing data
 EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete()
-ej_dump = json.load(open("backups/ej_dump_20240815_112916.json"))
+ej_dump = json.load(open("backups/ej_dump_20241017_133151.json"))
 for dataset in ej_dump:
     ej_row = EnvironmentalJusticeRow(
         destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV,

diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py
index 36d7f722..c44aebc5 100644
--- a/scripts/ej/create_ej_dump.py
+++ b/scripts/ej/create_ej_dump.py
@@ -2,7 +2,7 @@
 inferences are supplied by the classification model.
 the contact point is Bishwas
 cmr is supplied by running
 github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py
-move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/
+move to the server like this: scp ej_dump_20241017_133151.json sde:/home/ec2-user/sde_indexing_helper/backups/
 """

 import json
@@ -19,12 +19,12 @@ def save_to_json(data: dict | list, file_path: str) -> None:
         json.dump(data, file, indent=2)


-def process_classifications(predictions: list[dict[str, float]], threshold: float = 0.5) -> list[str]:
+def process_classifications(predictions: list[dict[str, float]], thresholds: dict[str, float]) -> list[str]:
     """
-    Process the predictions and classify as follows:
-    1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification
-    2. Filter classifications based on the threshold, excluding 'Not EJ'
-    3. Default to 'Not EJ' if no classifications meet the threshold
+    Process the predictions and classify based on the individual thresholds per indicator:
+    1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification.
+    2. Filter classifications based on their individual thresholds, excluding 'Not EJ'.
+    3. Default to 'Not EJ' if no classifications meet the threshold.
     """
     highest_prediction = max(predictions, key=lambda x: x["score"])

@@ -32,7 +32,9 @@ def process_classifications(predictions: list[dict[str, float]], thresholds: dict[str, float]) -> list[str]:
         return ["Not EJ"]

     classifications = [
-        pred["label"] for pred in predictions if pred["score"] >= threshold and pred["label"] != "Not EJ"
+        pred["label"]
+        for pred in predictions
+        if pred["score"] >= thresholds[pred["label"]] and pred["label"] != "Not EJ"
     ]

     return classifications if classifications else ["Not EJ"]
@@ -63,14 +65,14 @@ def remove_unauthorized_classifications(classifications: list[str]) -> list[str]:
 def update_cmr_with_classifications(
     inferences: list[dict[str, dict]],
     cmr_dict: dict[str, dict[str, dict]],
-    threshold: float = 0.5,
+    thresholds: dict[str, float],
 ) -> list[dict[str, dict]]:
     """Update CMR data with valid classifications based on inferences."""

     predicted_cmr = []

     for inference in inferences:
-        classifications = process_classifications(predictions=inference["predictions"], threshold=threshold)
+        classifications = process_classifications(predictions=inference["predictions"], thresholds=thresholds)
         classifications = remove_unauthorized_classifications(classifications)

         if classifications:
@@ -84,17 +86,30 @@ def main():
-    inferences = load_json_file("cmr-inference.json")
+    thresholds = {
+        "Not EJ": 0.80,
+        "Climate Change": 0.95,
+        "Disasters": 0.80,
+        "Extreme Heat": 0.50,
+        "Food Availability": 0.80,
+        "Health & Air Quality": 0.90,
+        "Human Dimensions": 0.80,
+        "Urban Flooding": 0.50,
+        "Water Availability": 0.80,
+    }
+
+    inferences = load_json_file("alpha-1.3-wise-vortex-42-predictions.json")
     cmr = load_json_file("cmr_collections_umm_20240807_142146.json")
     cmr_dict = create_cmr_dict(cmr)

-    predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, threshold=0.8)
+    predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, thresholds=thresholds)

     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     file_name = f"ej_dump_{timestamp}.json"

     save_to_json(predicted_cmr, file_name)
+    print(f"Saved to {file_name}")


 if __name__ == "__main__":
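
A worked example of the per-indicator behaviour introduced above may help; the prediction scores are invented for illustration, and the snippet assumes it runs with access to process_classifications from scripts/ej/create_ej_dump.py:

thresholds = {"Not EJ": 0.80, "Climate Change": 0.95, "Extreme Heat": 0.50}

predictions = [
    {"label": "Climate Change", "score": 0.97},  # 0.97 >= 0.95 -> kept
    {"label": "Extreme Heat", "score": 0.62},    # 0.62 >= 0.50 -> kept
    {"label": "Not EJ", "score": 0.10},          # excluded unless it scores highest overall
]

# "Climate Change" wins the argmax, so the early "Not EJ" return is skipped and
# each label is then checked against its own threshold rather than a global one.
print(process_classifications(predictions, thresholds))
# -> ['Climate Change', 'Extreme Heat']

Note that every label the model can emit needs an entry in the thresholds dict, since thresholds[pred["label"]] is looked up unconditionally.
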
From b559facb6a5a43104445943cf1eadec4fe6ae0e7 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 23 Oct 2024 21:48:31 -0500
Subject: [PATCH 032/441] Fixes issue #1071

---
 .envs/.local/.django                      |   5 +
 sde_collections/admin.py                  |  18 ++
 .../0059_candidateurl_scraped_text.py     |  18 ++
 sde_collections/models/candidate_url.py   |   1 +
 sde_collections/tasks.py                  | 192 +++++++++++++++++-
 5 files changed, 232 insertions(+), 2 deletions(-)
 create mode 100644 sde_collections/migrations/0059_candidateurl_scraped_text.py

diff --git a/.envs/.local/.django b/.envs/.local/.django
index 402efc3c..ce2e8095 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -39,3 +39,8 @@ XLI_USER=''
 XLI_PASSWORD=''
 LRM_QA_USER=''
 LRM_QA_PASSWORD=''
+
+#Server Tokens
+#--------------------------------------------------------------------------------
+LRMDEV_TOKEN=''
+LIS_TOKEN=''
\ No newline at end of file

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index cb105f80..ecf92838 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -7,6 +7,22 @@
 from .models.collection import Collection, WorkflowHistory
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
 from .tasks import import_candidate_urls_from_api
+from .tasks import fetch_and_update_full_text
+
+
+@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
+def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
+    for collection in queryset:
+        fetch_and_update_full_text.delay(collection.id, "LRM_DEV")
+    modeladmin.message_user(request, "Full text fetched and updated from LRM_DEV successfully.")
+
+
+@admin.action(description="Import candidate URLs from Li's Server with Full Text")
+def fetch_full_text_lis_action(modeladmin, request, queryset):
+    for collection in queryset:
+        fetch_and_update_full_text.delay(collection.id, "LIS")
+    modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.")
+


 @admin.action(description="Generate deployment message")
@@ -239,6 +255,8 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
         import_candidate_urls_lis_server,
         import_candidate_urls_lrm_dev_server,
         import_candidate_urls_lrm_qa_server,
+        fetch_full_text_lrm_dev_action,
+        fetch_full_text_lis_action,
     ]
     ordering = ("cleaning_order",)

diff --git a/sde_collections/migrations/0059_candidateurl_scraped_text.py b/sde_collections/migrations/0059_candidateurl_scraped_text.py
new file mode 100644
index 00000000..cc3ea65b
--- /dev/null
+++ b/sde_collections/migrations/0059_candidateurl_scraped_text.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2024-10-21 23:10
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="candidateurl",
+            name="scraped_text",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py
index 51c3a28b..936ea363 100644
--- a/sde_collections/models/candidate_url.py
+++ b/sde_collections/models/candidate_url.py
@@ -35,6 +35,7 @@
         blank=True,
         help_text="This is the original title scraped by Sinequa",
     )
+    scraped_text = models.TextField(blank=True, null=True)
     generated_title = models.CharField(
         "Generated Title",
         default="",

diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py
index fa754efc..3172b22f 100644
---
a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,13 +1,13 @@ import json import os import shutil - +import requests import boto3 from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata - +from sde_collections.models.candidate_url import CandidateURL from config import celery_app from .models.collection import Collection, WorkflowStatusChoices @@ -141,3 +141,191 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() +''' +@celery_app.task +def fetch_and_update_full_text(collection_id): + + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server + sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" + token = os.getenv('LRMDEV_TOKEN') + + + payload = json.dumps({ + "method": "engine.sql", + "sql": sql_command, + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + # Directly update or create the entry without checking for content changes + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Processed {len(records)} records; Updated or created in database." + else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + ''' + +#You will have to have a different function for Li's server as it uses user and pw with body to login. +#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). 
if lis then header1, elif lrm_dev then h2, else h3 +#Fill in the tokens in the .django file + +#Integrated - LRM devs and Lis separate +''' +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + # Server-specific configurations + server_config = get_server_config(server_type) + + # API Request Parameters + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + token = server_config["token"] + url = server_config["url"] + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + # Send the request + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + return f"Processed {len(records)} records; Updated or created in database." + else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + + +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") +''' + + +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + server_config = get_server_config(server_type) + token = server_config["token"] + url = server_config["url"] + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + try: + response = requests.post(url, headers=headers, data=payload, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {str(e)}") + + records = response.json().get("Rows", []) + if not records: + return "No records found in the response." + + for record in records: + url, full_text, title = record + if not (url and full_text and title): + continue + + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Successfully processed {len(records)} records and updated the database." 
+ +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") + From 8678ed6e83edc61461c51a51cc8bd9b5c9190dde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:05:09 +0000 Subject: [PATCH 033/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .envs/.local/.django | 2 +- sde_collections/admin.py | 4 +- sde_collections/tasks.py | 82 ++++++++++++++++++---------------------- 3 files changed, 39 insertions(+), 49 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index ce2e8095..07e159fa 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -43,4 +43,4 @@ LRM_QA_PASSWORD='' #Server Tokens #-------------------------------------------------------------------------------- LRMDEV_TOKEN='' -LIS_TOKEN='' \ No newline at end of file +LIS_TOKEN='' diff --git a/sde_collections/admin.py b/sde_collections/admin.py index ecf92838..7b519a15 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,8 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .tasks import import_candidate_urls_from_api -from .tasks import fetch_and_update_full_text +from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api @admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") @@ -22,7 +21,6 @@ def fetch_full_text_lis_action(modeladmin, request, queryset): for collection in queryset: fetch_and_update_full_text.delay(collection.id, "LIS") modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.") - @admin.action(description="Generate deployment message") diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 3172b22f..8d93a2de 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,14 +1,16 @@ import json import os import shutil -import requests + import boto3 +import requests from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata -from sde_collections.models.candidate_url import CandidateURL + from config import celery_app +from sde_collections.models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowStatusChoices from .sinequa_api import Api @@ -141,15 +143,17 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() -''' + + +""" @celery_app.task def fetch_and_update_full_text(collection_id): - + try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" token = os.getenv('LRMDEV_TOKEN') @@ -164,12 +168,12 @@ def 
fetch_and_update_full_text(collection_id): "resolveIndexList": "false", "engines": "default" }) - + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {token}' } - + response = requests.post(url, headers=headers, data=payload) if response.status_code == 200: records = response.json().get("Rows", []) @@ -190,21 +194,21 @@ def fetch_and_update_full_text(collection_id): return f"Processed {len(records)} records; Updated or created in database." else: raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - ''' + """ -#You will have to have a different function for Li's server as it uses user and pw with body to login. -#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 -#Fill in the tokens in the .django file +# You will have to have a different function for Li's server as it uses user and pw with body to login. +# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 +# Fill in the tokens in the .django file -#Integrated - LRM devs and Lis separate -''' +# Integrated - LRM devs and Lis separate +""" @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + # Server-specific configurations server_config = get_server_config(server_type) @@ -260,7 +264,7 @@ def get_server_config(server_type): } else: raise ValueError("Invalid server type.") -''' +""" @celery_app.task @@ -274,20 +278,19 @@ def fetch_and_update_full_text(collection_id, server_type): token = server_config["token"] url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + + payload = json.dumps( + { + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default", + } + ) try: response = requests.post(url, headers=headers, data=payload, timeout=10) @@ -302,30 +305,19 @@ def fetch_and_update_full_text(collection_id, server_type): for record in records: url, full_text, title = record if not (url and full_text and title): - continue + continue CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } + url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title} ) return f"Successfully processed {len(records)} records and updated the database." 
+ def get_server_config(server_type): if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } + return {"url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LRMDEV_TOKEN")} elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } + return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - From e4881a94adaa5dba4d9dca928a55117ef4e671b7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 23 Oct 2024 22:24:32 -0500 Subject: [PATCH 034/441] Fixes issue #1071 --- sde_collections/tasks.py | 124 +-------------------------------------- 1 file changed, 1 insertion(+), 123 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 8d93a2de..0c54ea0c 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,129 +144,6 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() - -""" -@celery_app.task -def fetch_and_update_full_text(collection_id): - - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server - sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" - token = os.getenv('LRMDEV_TOKEN') - - - payload = json.dumps({ - "method": "engine.sql", - "sql": sql_command, - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - # Directly update or create the entry without checking for content changes - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - """ - -# You will have to have a different function for Li's server as it uses user and pw with body to login. -# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). 
if lis then header1, elif lrm_dev then h2, else h3 -# Fill in the tokens in the .django file - -# Integrated - LRM devs and Lis separate -""" -@celery_app.task -def fetch_and_update_full_text(collection_id, server_type): - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - # Server-specific configurations - server_config = get_server_config(server_type) - - # API Request Parameters - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - token = server_config["token"] - url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - # Send the request - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - - -def get_server_config(server_type): - if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } - elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } - else: - raise ValueError("Invalid server type.") -""" - - @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -321,3 +198,4 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") + \ No newline at end of file From 47f164f7f7a5d3a1f3f983d92d9d1bd4636f087b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:25:01 +0000 Subject: [PATCH 035/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 0c54ea0c..f505c942 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,6 +144,7 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() + @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -198,4 +199,3 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - \ No newline at end of file From f4849e862184c83e20e115f2ce2beffb38daf914 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Tue, 29 Oct 2024 23:09:16 -0500 Subject: [PATCH 036/441] add PairedFieldDescriptor two-column tag model --- ...ection_tdamm_manual_collection_tdamm_ml.py | 23 ++++++++++++++++++ 
 sde_collections/models/collection.py          |  5 ++++
 .../utils/paired_field_descriptor.py          | 24 +++++++++++++++++++
 3 files changed, 52 insertions(+)
 create mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
 create mode 100644 sde_collections/utils/paired_field_descriptor.py

diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
new file mode 100644
index 00000000..557ad13e
--- /dev/null
+++ b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py
@@ -0,0 +1,23 @@
+# Generated by Django 4.2.9 on 2024-10-30 00:44
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_manual",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_ml",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+    ]

diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py
index 31306b8c..a2d3181c 100644
--- a/sde_collections/models/collection.py
+++ b/sde_collections/models/collection.py
@@ -26,6 +26,7 @@
     UpdateFrequencies,
     WorkflowStatusChoices,
 )
+from ..utils.paired_field_descriptor import PairedFieldDescriptor

 User = get_user_model()

@@ -33,6 +34,10 @@
 class Collection(models.Model):
     """Model definition for Collection."""

+    tdamm_manual = models.CharField(max_length=255, null=True, blank=True)
+    tdamm_ml = models.CharField(max_length=255, null=True, blank=True)
+    tdamm = PairedFieldDescriptor('tdamm')
+
     name = models.CharField("Name", max_length=1024)
     config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False)
     url = models.URLField("URL", max_length=2048)

diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py
new file mode 100644
index 00000000..e07d41dc
--- /dev/null
+++ b/sde_collections/utils/paired_field_descriptor.py
@@ -0,0 +1,24 @@
+from django.db import models
+
+
+class PairedFieldDescriptor:
+    def __init__(self, field_name):
+        self.manual_field_name = f"{field_name}_manual"
+        self.ml_field_name = f"{field_name}_ml"
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            return self
+        # Return manual tag if available, otherwise ML tag
+        manual_value = getattr(instance, self.manual_field_name, None)
+        machine_learning_value = getattr(instance, self.ml_field_name, None)
+        return manual_value if manual_value is not None else machine_learning_value
+
+    def __set__(self, instance, value):
+        # Set the value of the manual field
+        setattr(instance, self.manual_field_name, value)
+
+    def __delete__(self, instance):
+        # Delete both manual and ML fields
+        delattr(instance, self.manual_field_name)
+        delattr(instance, self.ml_field_name)
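
Since PairedFieldDescriptor is the core abstraction of this patch, a small standalone sketch of its resolution order may be useful; plain attributes stand in for the Django model columns here, and the tag values are placeholders:

from sde_collections.utils.paired_field_descriptor import PairedFieldDescriptor

class FakeCollection:
    tdamm = PairedFieldDescriptor("tdamm")  # reads tdamm_manual first, then tdamm_ml

    def __init__(self, manual=None, ml=None):
        self.tdamm_manual = manual
        self.tdamm_ml = ml

c = FakeCollection(ml="ml-assigned-tag")
print(c.tdamm)           # -> "ml-assigned-tag" (manual is None, so the ML tag wins)

c.tdamm = "curator-tag"  # __set__ writes to the manual field only
print(c.tdamm_manual)    # -> "curator-tag"
print(c.tdamm)           # -> "curator-tag" (manual now takes precedence)

One design consequence worth noting: because __get__ falls back only when the manual value is None, an empty string stored in the manual field will still shadow the ML value.
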
From a469ef1242824645885433ac0d3ecd8d4a23a7fe Mon Sep 17 00:00:00 2001
From: Kiran Dawadi
Date: Wed, 30 Oct 2024 16:14:07 -0500
Subject: [PATCH 037/441] add fields to admin panel

---
 sde_collections/admin.py                          | 35 +++++++++++++++++++
 ...remove_collection_tdamm_manual_and_more.py     | 31 ++++++++++++++++
 sde_collections/models/collection.py              |  6 ++--
 sde_collections/serializers.py                    |  1 +
 4 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py

diff --git a/sde_collections/admin.py b/sde_collections/admin.py
index cb105f80..add9a906 100644
--- a/sde_collections/admin.py
+++ b/sde_collections/admin.py
@@ -7,6 +7,7 @@
 from .models.collection import Collection, WorkflowHistory
 from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
 from .tasks import import_candidate_urls_from_api
+from django import forms


 @admin.action(description="Generate deployment message")
@@ -174,10 +175,34 @@
     update_config.short_description = "Update configs of selected"


+class CollectionForm(forms.ModelForm):
+    tdamm_tag = forms.CharField(required=False, label="TDAMM Tag")
+
+    class Meta:
+        model = Collection
+        fields = "__all__"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.instance and hasattr(self.instance, "tdamm_tag"):
+            # Set the initial value of tdamm_tag to the computed value
+            self.fields["tdamm_tag"].initial = self.instance.tdamm_tag
+
+    def clean(self):
+        cleaned_data = super().clean()
+        tdamm_value = cleaned_data.get("tdamm_tag")
+        if tdamm_value:
+            # Set the manual field with the value from tdamm
+            cleaned_data["tdamm_tag_manual"] = tdamm_value
+        return cleaned_data
+
+
 @admin.register(Collection)
 class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
     """Admin View for Collection"""

+    form = CollectionForm
+
     fieldsets = (
         (
             "Essential information",
             {
                 "fields": (
                     "name",
                     "config_folder",
                     "url",
                     "division",
+                    "tdamm_tag",
+                    "tdamm_tag_ml",
+                    "tdamm_tag_manual",
                     "document_type",
                     "update_frequency",
                     "source",
@@ -215,15 +243,22 @@
         ),
     )

+    def tdamm_tag(self, obj):
+        return obj.tdamm_tag
+
     list_display = (
         "name",
         "candidate_urls_count",
         "config_folder",
         "url",
+        "tdamm_tag",
+        "tdamm_tag_ml",
+        "tdamm_tag_manual",
         "division",
         "new_collection",
         "is_multi_division",
     )
+
     readonly_fields = ("config_folder",)
     list_filter = ("division", "curation_status", "workflow_status", "turned_on", "is_multi_division")
     search_fields = ("name", "url", "config_folder")

diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py
new file mode 100644
index 00000000..37b817a7
--- /dev/null
+++ b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py
@@ -0,0 +1,31 @@
+# Generated by Django 4.2.9 on 2024-10-30 21:05
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="collection",
+            name="tdamm_manual",
+        ),
+        migrations.RemoveField(
+            model_name="collection",
+            name="tdamm_ml",
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_tag_manual",
+            field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"),
+        ),
+        migrations.AddField(
+            model_name="collection",
+            name="tdamm_tag_ml",
+            field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"),
+        ),
+    ]

diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py
index a2d3181c..1d140889 100644
--- a/sde_collections/models/collection.py
+++
b/sde_collections/models/collection.py @@ -34,9 +34,9 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_manual = models.CharField(max_length=255, null=True, blank=True) - tdamm_ml = models.CharField(max_length=255, null=True, blank=True) - tdamm = PairedFieldDescriptor('tdamm') + tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") + tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") + tdamm_tag = PairedFieldDescriptor('tdamm_tag') name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..19717818 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,6 +26,7 @@ class Meta: "workflow_status_display", "curated_by", "division", + "tdamm_tag", "document_type", "name", ) From 8e8e0ac743f6a915e8196c6dc9914060766315eb Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 00:13:31 -0600 Subject: [PATCH 038/441] moved tdamm_tags feature from collection to candidate_url --- sde_collections/admin.py | 127 ++++++++++++----- ..._candidateurl_tdamm_tag_manual_and_more.py | 134 ++++++++++++++++++ ...ection_tdamm_manual_collection_tdamm_ml.py | 23 --- ...remove_collection_tdamm_manual_and_more.py | 31 ---- sde_collections/models/candidate_url.py | 60 +++++++- sde_collections/models/collection.py | 5 - sde_collections/serializers.py | 2 +- 7 files changed, 287 insertions(+), 95 deletions(-) create mode 100644 sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py delete mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py delete mode 100644 sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index add9a906..73576899 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,6 +8,7 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms +from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -175,34 +176,10 @@ def update_config(self, request, queryset): update_config.short_description = "Update configs of selected" -class CollectionForm(forms.ModelForm): - tdamm_tag = forms.CharField(required=False, label="TDAMM Tag") - - class Meta: - model = Collection - fields = "__all__" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.instance and hasattr(self.instance, "tdamm_tag"): - # Set the initial value of tdamm_tag to the computed value - self.fields["tdamm_tag"].initial = self.instance.tdamm_tag - - def clean(self): - cleaned_data = super().clean() - tdamm_value = cleaned_data.get("tdamm_tag") - if tdamm_value: - # Set the manual field with the value from tdamm - cleaned_data["tdamm_tag_manual"] = tdamm_value - return cleaned_data - - @admin.register(Collection) class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): """Admin View for Collection""" - form = CollectionForm - fieldsets = ( ( "Essential information", @@ -212,9 +189,6 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): "config_folder", "url", "division", - "tdamm_tag", - "tdamm_tag_ml", - 
"tdamm_tag_manual", "document_type", "update_frequency", "source", @@ -243,17 +217,11 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): ), ) - def tdamm_tag(self, obj): - return obj.tdamm_tag - list_display = ( "name", "candidate_urls_count", "config_folder", "url", - "tdamm_tag", - "tdamm_tag_ml", - "tdamm_tag_manual", "division", "new_collection", "is_multi_division", @@ -296,13 +264,104 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() +class CandidateURLForm(forms.ModelForm): + # tdamm_tag = forms.MultipleChoiceField( + # choices=CandidateURL.TDAMM_TAG_CHOICES, + # required=False, + # label="TDAMM Tags", + # widget=forms.CheckboxSelectMultiple, + # ) + + tdamm_tag_ml = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM ML Tags", + widget=forms.CheckboxSelectMultiple, + ) + + tdamm_tag_manual = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM Manual Tags", + widget=forms.CheckboxSelectMultiple, + ) + + class Meta: + model = CandidateURL + fields = '__all__' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tdamm_tag + # if self.instance and hasattr(self.instance, 'tdamm_tag'): + # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] + + # Initialize tdamm_tag_ml + if self.instance and self.instance.tdamm_tag_ml: + self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml + + # Initialize tdamm_tag_manual + if self.instance and self.instance.tdamm_tag_manual: + self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + + def clean(self): + cleaned_data = super().clean() + + # Handle tdamm_tag + # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) + # if not tdamm_tag_value: + # cleaned_data['tdamm_tag_manual'] = None + # else: + # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value + + # Handle tdamm_tag_ml + tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) + if not tdamm_tag_ml_value: + cleaned_data['tdamm_tag_ml'] = None + + # Handle tdamm_tag_manual + tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) + if not tdamm_tag_manual_value: + cleaned_data['tdamm_tag_manual'] = None + + return cleaned_data class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" - list_display = ("url", "scraped_title", "collection") + form = CandidateURLForm + + list_display = ( + "url", + "scraped_title", + "collection", + # "tdamm_tag_display", + "tdamm_tag_ml_display", + "tdamm_tag_manual_display" + ) list_filter = ("collection",) + # @admin.display(description='TDAMM Tags') + # def tdamm_tag_display(self, obj): + # if obj.tdamm_tag: + # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] + # return ", ".join(readable_tags) + # return "" + + @admin.display(description='TDAMM ML Tags') + def tdamm_tag_ml_display(self, obj): + if obj.tdamm_tag_ml: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] + return ", ".join(readable_tags) + return "" + + @admin.display(description='TDAMM Manual Tags') + def tdamm_tag_manual_display(self, obj): + if obj.tdamm_tag_manual: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] + return ", ".join(readable_tags) + return "" + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for 
TitlePattern""" diff --git a/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..057f1ed6 --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,134 @@ +# Generated by Django 4.2.9 on 2024-11-02 04:36 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger 
- EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_manual", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_ml", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), + ), + ] diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py deleted file mode 100644 index 557ad13e..00000000 --- a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-30 00:44 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), - ] - - operations = [ - migrations.AddField( - model_name="collection", - name="tdamm_manual", - field=models.CharField(blank=True, max_length=255, null=True), - ), - migrations.AddField( - model_name="collection", - name="tdamm_ml", - field=models.CharField(blank=True, max_length=255, null=True), - ), - ] diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py deleted file mode 100644 index 37b817a7..00000000 --- a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by 
Django 4.2.9 on 2024-10-30 21:05 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"), - ] - - operations = [ - migrations.RemoveField( - model_name="collection", - name="tdamm_manual", - ), - migrations.RemoveField( - model_name="collection", - name="tdamm_ml", - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_manual", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_ml", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), - ), - ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 51c3a28b..f8c91a97 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -7,7 +7,8 @@ from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern - +from ..utils.paired_field_descriptor import PairedFieldDescriptor +from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): @@ -80,6 +81,63 @@ class CandidateURL(models.Model): help_text="Helps keep track if the Current URL is present in production or not", ) + TDAMM_TAG_CHOICES = [ + ('MMA_M_EM', 'Messenger - EM Radiation'), + ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), + ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), + ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), + ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), + ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), + ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), + ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), + ('MMA_M_G', 'Messenger - Gravitational Waves'), + ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), + ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), + ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), + ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), + ('MMA_M_C', 'Messenger - Cosmic Rays'), + ('MMA_M_N', 'Messenger - Neutrinos'), + ('MMA_O_BI', 'Objects - Binaries'), + ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), + ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), + ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), + ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), + ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), + ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), + ('MMA_O_BH', 'Objects - Black Holes'), + ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), + ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), + ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), + ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), + ('MMA_O_E', 'Objects - Exoplanets'), + ('MMA_O_N', 'Objects - Neutron Stars'), + ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), + ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), + ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), + ('MMA_O_S', 'Objects - Supernova Remnants'), + ('MMA_S_F', 'Signals - Fast Radio Bursts'), + ('MMA_S_G', 'Signals - Gamma-ray Bursts'), + ('MMA_S_K', 'Signals - Kilonovae'), + ('MMA_S_N', 'Signals - Novae'), + ('MMA_S_P', 'Signals - 
Pevatrons'), + ('MMA_S_ST', 'Signals - Stellar flares'), + ('MMA_S_SU', 'Signals - Supernovae'), + ] + + tdamm_tag_manual = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM Manual Tags" + ) + tdamm_tag_ml = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM ML Tags" + ) + tdamm_tag = PairedFieldDescriptor('tdamm_tag') + class Meta: """Meta definition for Candidate URL.""" diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 1d140889..31306b8c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -26,7 +26,6 @@ UpdateFrequencies, WorkflowStatusChoices, ) -from ..utils.paired_field_descriptor import PairedFieldDescriptor User = get_user_model() @@ -34,10 +33,6 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") - tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") - tdamm_tag = PairedFieldDescriptor('tdamm_tag') - name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) url = models.URLField("URL", max_length=2048) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 19717818..b7bb3b25 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,7 +26,6 @@ class Meta: "workflow_status_display", "curated_by", "division", - "tdamm_tag", "document_type", "name", ) @@ -123,6 +122,7 @@ class Meta: "hash", "file_extension", "tree_root", + "tdamm_tag" ) def get_document_type(self, obj): From 6bf48ff100d32cfe3e52605b13625f044210e79b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:19 -0600 Subject: [PATCH 039/441] adding admin views for DumpURL and URL models --- sde_collections/admin.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 4fce1ea7..a8fce352 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,7 +7,9 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -315,6 +317,20 @@ class DeltaUrlAdmin(admin.ModelAdmin): list_filter = ("collection",) +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "generated_title", "collection") + list_filter = ("collection",) + + +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) @@ -323,3 +339,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): admin.site.register(DivisionPattern, DivisionPatternAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(Url, UrlAdmin) From 
483685135cceffd131de25ffbf78c9d0bbdee929 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:34 -0600 Subject: [PATCH 040/441] migration for the dump URL file --- sde_collections/migrations/0061_dumpurl.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py new file mode 100644 index 00000000..4aeb0088 --- /dev/null +++ b/sde_collections/migrations/0061_dumpurl.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.9 on 2024-10-23 19:29 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0060_delete_dumpurl"), + ] + + operations = [ + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 19feff8cd273488bb727db9e9b81b9a0a112701b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:49:31 -0600 Subject: [PATCH 041/441] adding tasks to compare and add URLs to the new models --- sde_collections/tasks.py | 109 +++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 10 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index fa754efc..ecc3c1a9 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -6,11 +6,14 @@ from django.apps import apps from django.conf import settings from django.core import management -from django.core.management.commands import loaddata from config import celery_app from .models.collection import Collection, WorkflowStatusChoices +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl +from .models.url import Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler @@ -49,7 +52,7 @@ def _get_data_to_import(collection, server_name): continue augmented_data = { - "model": "sde_collections.candidateurl", + "model": "sde_collections.url", "fields": { "collection": collection_pk, "url": url, @@ -62,6 +65,88 @@ def _get_data_to_import(collection, server_name): return data_to_import +def _compare_and_populate_delta_urls(collection): + """Compare DumpUrl and CuratedUrl and populate DeltaUrl.""" + dump_urls = DumpUrl.objects.filter(collection=collection) + curated_urls = CuratedUrl.objects.filter(collection=collection) + + DeltaUrl.objects.filter(collection=collection).delete() + + curated_urls_dict = {url.url: url for url in curated_urls} + + # Iterate over Dump URLs to find deltas + for dump_url in dump_urls: + curated_url = curated_urls_dict.get(dump_url.url) + + if not curated_url: + # New URL found, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + elif ( + curated_url.scraped_title != dump_url.scraped_title + or curated_url.generated_title != dump_url.generated_title + or curated_url.document_type != dump_url.document_type + or curated_url.division != 
dump_url.division + ): + # Metadata changed, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + + # Mark any missing URLs in CuratedUrl as deleted in DeltaUrl + dump_url_set = set(dump_urls.values_list("url", flat=True)) + for curated_url in curated_urls: + if curated_url.url not in dump_url_set: + DeltaUrl.objects.create( + collection=collection, + url=curated_url.url, + scraped_title=curated_url.scraped_title, + generated_title=curated_url.generated_title, + document_type=curated_url.document_type, + division=curated_url.division, + delete=True, + ) + + +def populate_dump_urls(collection): + urls = Url.objects.filter(collection=collection) + + for url_instance in urls: + try: + # Create DumpUrl by passing in the parent Url fields + dump_url_instance = DumpUrl( + id=url_instance.id, + collection=url_instance.collection, + url=url_instance.url, + scraped_title=url_instance.scraped_title, + visited=url_instance.visited, + document_type=url_instance.document_type, + division=url_instance.division, + ) + dump_url_instance.save() # Save both Url and DumpUrl entries + + print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}") + + except Exception as e: + print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}") + continue + + print(f"Successfully populated DumpUrl model with {urls.count()} entries.") + + @celery_app.task(soft_time_limit=10000) def import_candidate_urls_from_api(server_name="test", collection_ids=[]): TEMP_FOLDER_NAME = "temp" @@ -76,26 +161,30 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]): data_to_import = _get_data_to_import(server_name=server_name, collection=collection) print(f"Got {len(data_to_import)} records for {collection.config_folder}") + print("Clearing DumpUrl model...") + DumpUrl.objects.filter(collection=collection).delete() + print("Dumping django fixture to file") json.dump(data_to_import, open(urls_file, "w")) - print("Deleting existing candidate URLs") - # this sometimes takes a while - collection.candidate_urls.all().delete() + print("Loading data into Url model using loaddata...") + management.call_command("loaddata", urls_file) - print("Loading fixture; this may take a while") - # subprocess.call(f'python manage.py loaddata "{urls_file}"', shell=True) - management.call_command(loaddata.Command(), urls_file) + print("Creating DumpUrl entries...") + populate_dump_urls(collection) print("Applying existing patterns; this may take a while") collection.apply_all_patterns() - if collection.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING: + print("Comparing DumpUrl with CuratedUrl...") + _compare_and_populate_delta_urls(collection) + + if collection.workflow_status != WorkflowStatusChoices.ENGINEERING_IN_PROGRESS: collection.workflow_status = WorkflowStatusChoices.ENGINEERING_IN_PROGRESS collection.save() # Finally set the status to READY_FOR_CURATION - collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION + # collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION collection.save() print("Deleting temp files") From 7e24495fb2489615c0b8a6fd4b79d2e7550c436c Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:50:26 -0600 Subject: [PATCH 042/441] adding a save method for dump URL --- sde_collections/models/dump_url.py | 5 +++++ 
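Note on the save() override below: with Django multi-table inheritance, a
single child save() already writes both the parent and the child rows, so on
a fresh create the guarded double call ends up writing each table twice. The
guard only pays off when a DumpUrl is built with the id of an existing Url
row, as populate_dump_urls() does in the previous patch. A commonly cited
alternative that attaches a child row to an existing parent without re-saving
the parent's fields is sketched here; promote_to_dump_url is a hypothetical
helper, not part of this patch, and assumes the DumpUrl/Url models as of this
point in the series:

    from sde_collections.models.dump_url import DumpUrl
    from sde_collections.models.url import Url

    def promote_to_dump_url(url: Url) -> DumpUrl:
        # Reuse the existing parent row rather than writing Url again.
        dump = DumpUrl(url_ptr=url)
        # Copy the parent's in-memory field state onto the child instance.
        dump.__dict__.update(url.__dict__)
        # raw=True skips saving parent models, so only the DumpUrl
        # link-table row is written.
        dump.save_base(raw=True)
        return dump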
1 file changed, 5 insertions(+) diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py index 85ef85d9..82e168ca 100644 --- a/sde_collections/models/dump_url.py +++ b/sde_collections/models/dump_url.py @@ -7,3 +7,8 @@ class DumpUrl(Url): class Meta: verbose_name = "Dump URL" verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) From e5e64f46c26d822c971d741c774bed8dabf1121b Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:34:53 -0600 Subject: [PATCH 043/441] move all url models into the same file --- sde_collections/models/curated_url.py | 9 -------- sde_collections/models/delta_url.py | 13 ----------- sde_collections/models/dump_url.py | 14 ------------ sde_collections/models/url.py | 31 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 36 deletions(-) delete mode 100644 sde_collections/models/curated_url.py delete mode 100644 sde_collections/models/delta_url.py delete mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py deleted file mode 100644 index d55dcb5f..00000000 --- a/sde_collections/models/curated_url.py +++ /dev/null @@ -1,9 +0,0 @@ -from .url import Url - - -class CuratedUrl(Url): - """Model for storing curated and live URLs after the curation process.""" - - class Meta: - verbose_name = "Curated URL" - verbose_name_plural = "Curated URLs" diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py deleted file mode 100644 index 028607ab..00000000 --- a/sde_collections/models/delta_url.py +++ /dev/null @@ -1,13 +0,0 @@ -from django.db import models - -from .url import Url - - -class DeltaUrl(Url): - """Model for storing delta URLs for curation purposes""" - - delete = models.BooleanField(default=False) - - class Meta: - verbose_name = "Delta URL" - verbose_name_plural = "Delta URLs" diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py deleted file mode 100644 index 82e168ca..00000000 --- a/sde_collections/models/dump_url.py +++ /dev/null @@ -1,14 +0,0 @@ -from .url import Url - - -class DumpUrl(Url): - """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" - - class Meta: - verbose_name = "Dump URL" - verbose_name_plural = "Dump URLs" - - def save(self, *args, **kwargs): - if not self.pk: # Ensure it's only called on create - super().save(*args, **kwargs) # Save the parent `Url` entry - super().save(*args, **kwargs) diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py index 7ce86dff..3fc70243 100644 --- a/sde_collections/models/url.py +++ b/sde_collections/models/url.py @@ -83,3 +83,34 @@ def __str__(self) -> str: def save(self, *args, **kwargs): super().save(*args, **kwargs) + + +class DumpUrl(Url): + """Model for storing all the imported URLs before separating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + 
verbose_name_plural = "Delta URLs" + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 7a906b71d5355fc13cacafd1f985ee692e9474ef Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:42:44 -0600 Subject: [PATCH 044/441] update admin url imports --- sde_collections/admin.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index a8fce352..df33af9d 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,11 +5,8 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url +from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl from .tasks import import_candidate_urls_from_api From 728a5b425b76d402ffefb83aef5f574fa7b84c2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:42:59 +0000 Subject: [PATCH 045/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index df33af9d..e7780846 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,7 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .tasks import import_candidate_urls_from_api From f5c69bd4ce64c1edcfdd700e15e0e0404b19ce67 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 14:46:19 -0600 Subject: [PATCH 046/441] refactor code --- sde_collections/admin.py | 171 ++++++++++++------ ..._candidateurl_tdamm_tag_manual_and_more.py | 151 ++++++++++++++++ sde_collections/models/candidate_url.py | 120 +++++++----- sde_collections/serializers.py | 21 ++- .../utils/paired_field_descriptor.py | 3 - 5 files changed, 349 insertions(+), 117 deletions(-) create mode 100644 sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 73576899..0860d0e5 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,7 +8,6 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms -from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -264,14 +263,8 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() -class CandidateURLForm(forms.ModelForm): - # tdamm_tag = forms.MultipleChoiceField( - # choices=CandidateURL.TDAMM_TAG_CHOICES, - # required=False, - # label="TDAMM Tags", - # widget=forms.CheckboxSelectMultiple, - # ) +class CandidateURLForm(forms.ModelForm): tdamm_tag_ml = forms.MultipleChoiceField( 
choices=CandidateURL.TDAMM_TAG_CHOICES, required=False, @@ -285,83 +278,141 @@ class CandidateURLForm(forms.ModelForm): label="TDAMM Manual Tags", widget=forms.CheckboxSelectMultiple, ) - + class Meta: model = CandidateURL - fields = '__all__' + fields = "__all__" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Initialize tdamm_tag - # if self.instance and hasattr(self.instance, 'tdamm_tag'): - # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] - - # Initialize tdamm_tag_ml - if self.instance and self.instance.tdamm_tag_ml: - self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml - - # Initialize tdamm_tag_manual - if self.instance and self.instance.tdamm_tag_manual: - self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + instance = kwargs.get("instance") + + # Only show TDAMM fields if is_tdamm is True + if not instance or not instance.is_tdamm: + if "tdamm_tag_ml" in self.fields: + del self.fields["tdamm_tag_ml"] + if "tdamm_tag_manual" in self.fields: + del self.fields["tdamm_tag_manual"] + else: + # Initialize tdamm fields only if is_tdamm is True + if hasattr(self.instance, "tdamm_tag_ml"): + self.fields["tdamm_tag_ml"].initial = self.instance.tdamm_tag_ml or [] + + if hasattr(self.instance, "tdamm_tag_manual"): + self.fields["tdamm_tag_manual"].initial = self.instance.tdamm_tag_manual or [] def clean(self): cleaned_data = super().clean() - - # Handle tdamm_tag - # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) - # if not tdamm_tag_value: - # cleaned_data['tdamm_tag_manual'] = None - # else: - # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value - - # Handle tdamm_tag_ml - tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) - if not tdamm_tag_ml_value: - cleaned_data['tdamm_tag_ml'] = None - - # Handle tdamm_tag_manual - tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) - if not tdamm_tag_manual_value: - cleaned_data['tdamm_tag_manual'] = None - return cleaned_data + def save(self, commit=True): + instance = super().save(commit=False) + + # Handle TDAMM fields if is_tdamm is True + if instance.is_tdamm: + # Get values from the form + tdamm_tag_ml = self.cleaned_data.get("tdamm_tag_ml", []) + tdamm_tag_manual = self.cleaned_data.get("tdamm_tag_manual", []) + + # Set the values directly on the instance + instance.tdamm_tag_ml = tdamm_tag_ml or None + instance.tdamm_tag_manual = tdamm_tag_manual or None + else: + # Clear TDAMM fields if is_tdamm is False + instance.tdamm_tag_ml = None + instance.tdamm_tag_manual = None + + if commit: + instance.save() + + return instance + + class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" form = CandidateURLForm - list_display = ( - "url", - "scraped_title", - "collection", - # "tdamm_tag_display", - "tdamm_tag_ml_display", - "tdamm_tag_manual_display" - ) - list_filter = ("collection",) - - # @admin.display(description='TDAMM Tags') - # def tdamm_tag_display(self, obj): - # if obj.tdamm_tag: - # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] - # return ", ".join(readable_tags) - # return "" - - @admin.display(description='TDAMM ML Tags') + def get_list_display(self, request): + list_display = [ + "url", + "scraped_title", + "collection", + "is_tdamm", + ] + # Add TDAMM-related fields only if any TDAMM-enabled URLs exist + if CandidateURL.objects.filter(is_tdamm=True).exists(): + list_display.extend(["tdamm_tag_ml_display", "tdamm_tag_manual_display"]) + return list_display + + 
list_filter = ("collection", "is_tdamm") + + @admin.display(description="TDAMM ML Tags") def tdamm_tag_ml_display(self, obj): - if obj.tdamm_tag_ml: + if obj.is_tdamm and obj.tdamm_tag_ml: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] return ", ".join(readable_tags) return "" - @admin.display(description='TDAMM Manual Tags') + @admin.display(description="TDAMM Manual Tags") def tdamm_tag_manual_display(self, obj): - if obj.tdamm_tag_manual: + if obj.is_tdamm and obj.tdamm_tag_manual: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] return ", ".join(readable_tags) return "" + def get_fieldsets(self, request, obj=None): + """Dynamically adjust fieldsets based on is_tdamm""" + fieldsets = [ + ( + "Essential Information", + { + "fields": ( + "collection", + "url", + "hash", + "scraped_title", + "generated_title", + "test_title", + "production_title", + "level", + "visited", + "document_type", + "division", + "inferenced_by", + "is_pdf", + "present_on_test", + "present_on_prod", + "is_tdamm", + ) + }, + ), + ] + + # Add TDAMM fields only if is_tdamm is True + if obj and obj.is_tdamm: + fieldsets.append( + ( + "TDAMM Tags", + { + "fields": ( + "tdamm_tag_ml", + "tdamm_tag_manual", + ), + "classes": ("collapse",), + }, + ) + ) + + return fieldsets + + def save_model(self, request, obj, form, change): + """Ensure proper saving of the model""" + if not obj.is_tdamm: + obj.tdamm_tag_ml = None + obj.tdamm_tag_manual = None + super().save_model(request, obj, form, change) + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for TitlePattern""" diff --git a/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..d8a0a4a7 --- /dev/null +++ b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,151 @@ +# Generated by Django 4.2.9 on 2024-11-04 06:33 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + 
("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_manual", + new_name="_tdamm_tag_manual", + ), + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", 
"Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_ml", + new_name="_tdamm_tag_ml", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_manual", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_ml", + ), + migrations.AddField( + model_name="candidateurl", + name="is_tdamm", + field=models.BooleanField( + default=False, help_text="Enable TDAMM tagging for this URL", verbose_name="Is TDAMM" + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index f8c91a97..41c1072f 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -10,6 +10,7 @@ from ..utils.paired_field_descriptor import PairedFieldDescriptor from django.contrib.postgres.fields import ArrayField + class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): return self.annotate( @@ -80,63 +81,91 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) + is_tdamm = models.BooleanField("Is TDAMM?", default=False, help_text="Enable TDAMM tagging for this URL") TDAMM_TAG_CHOICES = [ - ('MMA_M_EM', 'Messenger - EM Radiation'), - ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), - ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), - ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), - ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), - ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), - ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), - ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), - ('MMA_M_G', 'Messenger - Gravitational Waves'), - ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), - ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), - ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), - ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), - ('MMA_M_C', 'Messenger - Cosmic Rays'), - ('MMA_M_N', 'Messenger - Neutrinos'), - ('MMA_O_BI', 'Objects - Binaries'), - ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), - ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), - ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), - ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), - ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), - ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), - ('MMA_O_BH', 'Objects - Black Holes'), - ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), - ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), - ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), - ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), - ('MMA_O_E', 'Objects - Exoplanets'), - ('MMA_O_N', 'Objects - Neutron Stars'), - ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), - ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), - ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), - ('MMA_O_S', 'Objects - Supernova Remnants'), - ('MMA_S_F', 'Signals - Fast Radio Bursts'), - ('MMA_S_G', 'Signals - Gamma-ray Bursts'), - ('MMA_S_K', 'Signals - Kilonovae'), - ('MMA_S_N', 'Signals - Novae'), - ('MMA_S_P', 'Signals - Pevatrons'), - ('MMA_S_ST', 
'Signals - Stellar flares'), - ('MMA_S_SU', 'Signals - Supernovae'), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), ] - tdamm_tag_manual = ArrayField( + # Define TDAMM fields but make them optional + @property + def tdamm_tag_manual(self): + if hasattr(self, "_tdamm_tag_manual") and self.is_tdamm: + return self._tdamm_tag_manual + return None + + @tdamm_tag_manual.setter + def tdamm_tag_manual(self, value): + if self.is_tdamm: + self._tdamm_tag_manual = value + + @property + def tdamm_tag_ml(self): + if hasattr(self, "_tdamm_tag_ml") and self.is_tdamm: + return self._tdamm_tag_ml + return None + + @tdamm_tag_ml.setter + def tdamm_tag_ml(self, value): + if self.is_tdamm: + self._tdamm_tag_ml = value + + _tdamm_tag_manual = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM Manual Tags" + verbose_name="TDAMM Manual Tags", + db_column="tdamm_tag_manual", ) - tdamm_tag_ml = ArrayField( + + _tdamm_tag_ml = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM ML Tags" + verbose_name="TDAMM ML Tags", + db_column="tdamm_tag_ml", ) - tdamm_tag = PairedFieldDescriptor('tdamm_tag') + + tdamm_tag = PairedFieldDescriptor("tdamm_tag") class Meta: """Meta definition for Candidate URL.""" @@ -144,6 +173,7 @@ class Meta: verbose_name = "Candidate URL" verbose_name_plural = "Candidate URLs" ordering = ["url"] + 
db_table = "sde_collections_candidateurl" @property def fileext(self) -> str: diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index b7bb3b25..29d86c31 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -112,18 +112,21 @@ class CandidateURLAPISerializer(serializers.ModelSerializer): title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() + tdamm_tag = serializers.SerializerMethodField() class Meta: model = CandidateURL - fields = ( - "url", - "title", - "document_type", - "hash", - "file_extension", - "tree_root", - "tdamm_tag" - ) + fields = ("url", "title", "document_type", "hash", "file_extension", "tree_root", "is_tdamm", "tdamm_tag") + + def to_representation(self, instance): + """Remove tdamm_tag field if is_tdamm is False""" + representation = super().to_representation(instance) + if not instance.is_tdamm: + representation.pop("tdamm_tag", None) + return representation + + def get_tdamm_tag(self, obj): + return obj.tdamm_tag def get_document_type(self, obj): if obj.document_type is not None: diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py index e07d41dc..9ac0c4e3 100644 --- a/sde_collections/utils/paired_field_descriptor.py +++ b/sde_collections/utils/paired_field_descriptor.py @@ -1,6 +1,3 @@ -from django.db import models - - class PairedFieldDescriptor: def __init__(self, field_name): self.manual_field_name = f"{field_name}_manual" From 7e888e8457f02bbe8417f6e66ecc1d52be9608c4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 20:49:23 +0000 Subject: [PATCH 047/441] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- sde_collections/models/candidate_url.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 0860d0e5..bf97cf02 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -1,5 +1,6 @@ import csv +from django import forms from django.contrib import admin, messages from django.http import HttpResponse @@ -7,7 +8,6 @@ from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api -from django import forms @admin.action(description="Generate deployment message") diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 41c1072f..8d2776dd 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -2,13 +2,13 @@ import os from urllib.parse import urlparse +from django.contrib.postgres.fields import ArrayField from django.db import models +from ..utils.paired_field_descriptor import PairedFieldDescriptor from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern -from ..utils.paired_field_descriptor import PairedFieldDescriptor -from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): From df88c6b11c1a91709bfcd01a1a88f8887f8b814b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:23:41 -0600 Subject: [PATCH 048/441] squashed migrations --- 
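Collapsing the history by hand, deleting 0060/0061 and regenerating 0059 so
that DumpUrl is created in a single step, is safe only while no database has
applied the deleted migrations; an environment that already ran them would
disagree with the new graph and need its django_migrations rows reconciled.
A quick consistency check after hand-editing migration files, a minimal
sketch assuming the project's configured Django settings (not part of this
patch):

    from django.core.management import call_command

    # Exits non-zero if the models no longer match the migration
    # files on disk.
    call_command("makemigrations", "sde_collections", check=True, dry_run=True)
    # Prints, without applying anything, the plan a fresh database would run.
    call_command("migrate", "sde_collections", plan=True)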
.../0059_url_curatedurl_deltaurl_dumpurl.py | 2 +- .../migrations/0060_delete_dumpurl.py | 16 --------- sde_collections/migrations/0061_dumpurl.py | 35 ------------------- 3 files changed, 1 insertion(+), 52 deletions(-) delete mode 100644 sde_collections/migrations/0060_delete_dumpurl.py delete mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py index 82f4d4af..58478546 100644 --- a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py +++ b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.9 on 2024-10-10 03:01 +# Generated by Django 4.2.9 on 2024-11-04 22:22 from django.db import migrations, models import django.db.models.deletion diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py deleted file mode 100644 index db9a10c1..00000000 --- a/sde_collections/migrations/0060_delete_dumpurl.py +++ /dev/null @@ -1,16 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-14 16:37 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"), - ] - - operations = [ - migrations.DeleteModel( - name="DumpUrl", - ), - ] diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py deleted file mode 100644 index 4aeb0088..00000000 --- a/sde_collections/migrations/0061_dumpurl.py +++ /dev/null @@ -1,35 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-23 19:29 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0060_delete_dumpurl"), - ] - - operations = [ - migrations.CreateModel( - name="DumpUrl", - fields=[ - ( - "url_ptr", - models.OneToOneField( - auto_created=True, - on_delete=django.db.models.deletion.CASCADE, - parent_link=True, - primary_key=True, - serialize=False, - to="sde_collections.url", - ), - ), - ], - options={ - "verbose_name": "Dump URL", - "verbose_name_plural": "Dump URLs", - }, - bases=("sde_collections.url",), - ), - ] From 48592cb6af69176b54fca27944dc0da370178aa1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:01 -0600 Subject: [PATCH 049/441] updated import references --- sde_collections/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 2f11700b..c42a84e6 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,7 +3,6 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -11,6 +10,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl class CollectionSerializer(serializers.ModelSerializer): From 266082c6f6054af9b0a72ed9cdf1a227012a080b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:20 -0600 Subject: [PATCH 050/441] updated import references --- sde_collections/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index ecc3c1a9..77876500 
100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -10,10 +10,7 @@ from config import celery_app from .models.collection import Collection, WorkflowStatusChoices -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl -from .models.url import Url +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler From c3e2aee2be337ab04387e860c6cef24fcc8266ac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:39 -0600 Subject: [PATCH 051/441] update import references --- sde_collections/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index b8ff70a0..5d5d2982 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,7 +27,6 @@ DocumentTypes, WorkflowStatusChoices, ) -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,6 +34,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, From f95a1a2666c3ab3d34c3331ccc883e83aa8c6006 Mon Sep 17 00:00:00 2001 From: Dhanur Sharma Date: Wed, 6 Nov 2024 15:55:28 -0600 Subject: [PATCH 052/441] Frontend work in progress --- sde_collections/serializers.py | 90 +- sde_collections/urls.py | 2 + sde_collections/views.py | 64 +- .../static/js/candidate_url_list.js | 848 +++++++++++++---- .../sde_collections/candidate_urls_list.html | 893 +++++++++++------- 5 files changed, 1378 insertions(+), 519 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index c42a84e6..ff1b6d3d 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -10,7 +10,7 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl class CollectionSerializer(serializers.ModelSerializer): @@ -99,6 +99,94 @@ class Meta: ) +class CuratedURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + curated_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_curated_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.curated_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = CuratedUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "curated_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + +class 
DeltaURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + delta_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_delta_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.delta_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = DeltaUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "delta_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + class CandidateURLBulkCreateSerializer(serializers.ModelSerializer): class Meta: model = CandidateURL diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 214d1198..a17f6390 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -9,6 +9,8 @@ router.register(r"collections", views.CollectionViewSet, basename="collection") router.register(r"collections-read", views.CollectionReadViewSet, basename="collection-read") router.register(r"candidate-urls", views.CandidateURLViewSet) +router.register(r"curated-urls", views.CuratedURLViewSet) +router.register(r"delta-urls", views.DeltaURLViewSet) router.register(r"exclude-patterns", views.ExcludePatternViewSet) router.register(r"include-patterns", views.IncludePatternViewSet) router.register(r"title-patterns", views.TitlePatternViewSet) diff --git a/sde_collections/views.py b/sde_collections/views.py index 5d5d2982..f738b23d 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -34,13 +34,15 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, CuratedUrlAPISerializer, + CuratedURLSerializer, + DeltaURLSerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -285,6 +287,66 @@ def update_division(self, request, pk=None): return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) +class CuratedURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = CuratedUrl.objects.all() + serializer_class = CuratedURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = 
self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + curated_url = get_object_or_404(CuratedUrl, pk=pk) + division = request.data.get("division") + if division: + curated_url.division = division + curated_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + +class DeltaURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = DeltaUrl.objects.all() + serializer_class = DeltaURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + delta_url = get_object_or_404(DeltaUrl, pk=pk) + division = request.data.get("division") + if division: + delta_url.division = division + delta_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + class CandidateURLBulkCreateView(generics.ListCreateAPIView): queryset = CandidateURL.objects.all() serializer_class = CandidateURLBulkCreateSerializer diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index ed6d3e4b..7b01cc6c 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -322,6 +322,436 @@ function initializeDataTable() { }, 1000) ); + var curated_urls_table = $("#curated_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#curatedUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#curatedScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#curatedNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter 
information to the first row + const secondRowFilters = [ + "Export of SDE Curated URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#curated_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_curated_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/curated-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getCuratedURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getCuratedGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "curated_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? 
[getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#curatedUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#curatedScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#curatedNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + + var delta_urls_table = $("#delta_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#deltaUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#deltaScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#deltaNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter information to the first row + const secondRowFilters = [ + "Export of SDE Delta URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#delta_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_delta_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/delta-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? 
false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getDeltaURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getDeltaGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "delta_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? [getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#deltaUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#deltaScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#deltaNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + var exclude_patterns_table = $("#exclude_patterns_table").DataTable({ // scrollY: true, dom: "lBrtip", @@ -682,96 +1112,96 @@ function initializeDataTable() { var division_patterns_table = $("#division_patterns_table").DataTable({ dom: "lBrtip", buttons: [ - { - text: "Add Pattern", - className: "addPattern", - action: function () { - $modal = $("#divisionPatternModal").modal(); - }, + { + text: "Add Pattern", + className: "addPattern", + action: function () { + $modal = $("#divisionPatternModal").modal(); }, - { - text: "Customize Columns", - className: "customizeColumns", - action: function () { - modalContents("#division_patterns_table"); - }, + }, + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#division_patterns_table"); }, + }, ], lengthMenu: [ - [25, 50, 100, 500], - ["Show 25", "Show 50", "Show 100", "Show 500"], + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], ], orderCellsTop: true, pageLength: 100, ajax: `/api/division-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { - this.api() - .columns() - .every(function (index) { - var table = $("#division_patterns_table").DataTable(); - - let 
-            1: {
-              columnToSearch: 6,
-              matchPattern: {
-                "Individual URL Pattern": 1,
-                "Multi-URL Pattern": 2,
-              },
-            },
-            2: {
-              columnToSearch: 7,
-              matchPattern: {
-                "Astrophysics": 1,
-                "Biological and Physical Sciences": 2,
-                "Earth Science": 3,
-                "Heliophysics": 4,
-                "Planetary Science": 5,
-              },
-            },
-          };
-
-          let column = this;
-          if (column.data().length === 0) {
-            $(`#division-patterns-dropdown-${index}`).prop("disabled", true);
-          } else if (index in addDropdownSelect) {
-            $("#division-patterns-dropdown-" + index).on("change", function () {
-              let col = addDropdownSelect[index].columnToSearch;
-              let searchInput =
-                addDropdownSelect[index].matchPattern[$(this).val()];
-              if ($(this).val() === "" || $(this).val() === undefined)
-                table.columns(col).search("").draw();
-              else {
-                table.columns(col).search(searchInput).draw();
-              }
-            });
-          }
-        });
+      this.api()
+        .columns()
+        .every(function (index) {
+          var table = $("#division_patterns_table").DataTable();
+
+          // Each visible dropdown filters a hidden value column: dropdown 1
+          // (match pattern type) searches column 6, dropdown 2 (division)
+          // searches column 7.
+          let addDropdownSelect = {
+            1: {
+              columnToSearch: 6,
+              matchPattern: {
+                "Individual URL Pattern": 1,
+                "Multi-URL Pattern": 2,
+              },
+            },
+            2: {
+              columnToSearch: 7,
+              matchPattern: {
+                "Astrophysics": 1,
+                "Biological and Physical Sciences": 2,
+                "Earth Science": 3,
+                "Heliophysics": 4,
+                "Planetary Science": 5,
+              },
+            },
+          };
+
+          let column = this;
+          if (column.data().length === 0) {
+            $(`#division-patterns-dropdown-${index}`).prop("disabled", true);
+          } else if (index in addDropdownSelect) {
+            $("#division-patterns-dropdown-" + index).on("change", function () {
+              let col = addDropdownSelect[index].columnToSearch;
+              let searchInput =
+                addDropdownSelect[index].matchPattern[$(this).val()];
+              if ($(this).val() === "" || $(this).val() === undefined)
+                table.columns(col).search("").draw();
+              else {
+                table.columns(col).search(searchInput).draw();
+              }
+            });
+          }
+        });
     },
     columns: [
-      { data: "match_pattern", class: "whiteText" },
-      {
-        data: "match_pattern_type_display",
-        class: "text-center whiteText",
-        sortable: false,
-      },
-      { data: "division_display", class: "whiteText" },
-      {
-        data: "candidate_urls_count",
-        class: "text-center whiteText",
-        sortable: true,
-      },
-      {
-        data: null,
-        sortable: false,
-        class: "text-center",
-        render: function (data, type, row) {
-          return `<button class="btn btn-danger delete_division_pattern" data-row-id="${row.id}"><i class="material-icons">delete</i></button>`;
-        },
-      },
-      { data: "id", visible: false, searchable: false },
-      { data: "match_pattern_type", visible: false },
-      { data: "division", visible: false },
+      { data: "match_pattern", class: "whiteText" },
+      {
+        data: "match_pattern_type_display",
+        class: "text-center whiteText",
+        sortable: false,
+      },
+      { data: "division_display", class: "whiteText" },
+      {
+        data: "candidate_urls_count",
+        class: "text-center whiteText",
+        sortable: true,
+      },
+      {
+        data: null,
+        sortable: false,
+        class: "text-center",
+        render: function (data, type, row) {
+          // NOTE: delete-pattern button; this markup is a reconstruction, so
+          // the class names and data attribute are assumed, not original.
+          return `<button class="btn btn-danger delete_division_pattern" data-row-id="${row.id}"><i class="material-icons">delete</i></button>`;
+        },
+      },
+      { data: "id", visible: false, searchable: false },
+      { data: "match_pattern_type", visible: false },
+      { data: "division", visible: false },
     ],
   });

@@ -841,8 +1271,8 @@ function getDivisionColumn() {
        `;
    },

@@ -882,7 +1312,7 @@ $("#division_pattern_form").on("submit", function (e) {
  inputs = {};
  input_serialized = $(this).serializeArray();
  input_serialized.forEach((field) => {
-    inputs[field.name] = field.value;
+    inputs[field.name] = field.value;
  });
  console.log("Form Inputs:", inputs); // Debugging line to check inputs

@@ -902,43 +1332,43 @@ $(".division_form_select").on("click", function (e) {

 function postDivisionPatterns(match_pattern, match_pattern_type, division) {
  if (!match_pattern) {
-      toastr.error("Please highlight a pattern to add division.");
-      return;
+    toastr.error("Please highlight a pattern to add division.");
+    return;
  }
  $.ajax({
-    url: "/api/division-patterns/",
-    type: "POST",
-    data: {
-      collection: collection_id,
-      match_pattern: match_pattern,
-      match_pattern_type: match_pattern_type,
-      division: division,
-      csrfmiddlewaretoken: csrftoken,
-    },
-    success: function (data) {
-      $("#candidate_urls_table").DataTable().ajax.reload(null, false);
-      $("#division_patterns_table").DataTable().ajax.reload(null, false);
-      if (currentTab === "") { // Only add a notification if we are on the first tab
-        newDivisionPatternsCount = newDivisionPatternsCount + 1;
-        $("#divisionPatternsTab").html(
-          `Division Patterns <span class="badge">` +
-          newDivisionPatternsCount + " new" +
-          `</span>`
-        );
-      }
-    },
-    error: function (xhr, status, error) {
-      var errorMessage = xhr.responseText;
-      if (
-        errorMessage ==
-        '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}'
-      ) {
-        toastr.success("Pattern already exists");
-        return;
-      }
-      toastr.error(errorMessage);
-    },
+    url: "/api/division-patterns/",
+    type: "POST",
+    data: {
+      collection: collection_id,
+      match_pattern: match_pattern,
+      match_pattern_type: match_pattern_type,
+      division: division,
+      csrfmiddlewaretoken: csrftoken,
+    },
+    success: function (data) {
+      $("#candidate_urls_table").DataTable().ajax.reload(null, false);
+      $("#division_patterns_table").DataTable().ajax.reload(null, false);
+      if (currentTab === "") { // Only add a notification if we are on the first tab
+        newDivisionPatternsCount = newDivisionPatternsCount + 1;
+        // NOTE: the badge <span> markup is a reconstruction; its classes are
+        // assumed, not original.
+        $("#divisionPatternsTab").html(
+          `Division Patterns <span class="badge">` +
+          newDivisionPatternsCount + " new" +
+          `</span>`
+        );
+      }
+    },
+    error: function (xhr, status, error) {
+      var errorMessage = xhr.responseText;
+      if (
+        errorMessage ==
+        '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}'
+      ) {
+        toastr.success("Pattern already exists");
+        return;
+      }
+      toastr.error(errorMessage);
+    },
  });
 }

@@ -950,9 +1380,36 @@ function getURLColumn() {
  return {
    data: "url",
    width: "30%",
    render: function (data, type, row) {
+      // NOTE: the anchor and icon markup in this template (and in the
+      // curated and delta variants below) is a reconstruction; the tag
+      // attributes are assumed, not original.
      return `<a target="_blank" href="${data}">${remove_protocol(
        data
      )}
-    <i class="material-icons url-icon">open_in_new</i>
-  `;
+    <i class="material-icons url-icon">open_in_new</i>`;
    },
  };
 }
+
+function getCuratedURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `<a target="_blank" href="${data}">${remove_protocol(
+        data
+      )}
+    <i class="material-icons url-icon">open_in_new</i>`;
+    },
+  };
+}
+
+function getDeltaURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `<a target="_blank" href="${data}">${remove_protocol(
+        data
+      )}
+    <i class="material-icons url-icon">open_in_new</i>`;
+    },
+  };
+}

@@ -972,13 +1429,36 @@ function getGeneratedTitleColumn() {
    data: "generated_title",
    width: "20%",
    render: function (data, type, row) {
+      // NOTE: the <input> markup here and in the two variants below is a
+      // reconstruction; the original attributes are assumed, not original.
-      return `<input type="text" class="form-control" value="${data}" />`;
+      return `<input type="text" class="form-control" value="${data}" />`;
    },
  };
 }
+
+function getCuratedGeneratedTitleColumn() {
+  return {
+    data: "generated_title",
+    width: "20%",
+    render: function (data, type, row) {
+      return `<input type="text" class="form-control" value="${data}" />`;
+    },
+  };
+}
+
+function getDeltaGeneratedTitleColumn() {
+  return {
+    data: "generated_title",
+    width: "20%",
+    render: function (data, type, row) {
+      return `<input type="text" class="form-control" value="${data}" />`;
+    },
+  };
+}

@@ -991,11 +1471,11 @@ function getExcludedColumn(true_icon, false_icon) {
    render: function (data, type, row) {
+      // NOTE: the anchor markup is a reconstruction; only the icon variables
+      // and the row["url"] wiring survive, so the attributes are assumed.
      return data === true
-        ? `<a href="#" data-url=${remove_protocol(row["url"])}>${true_icon}</a>`
-        : `<a href="#" data-url=${remove_protocol(row["url"])}>${false_icon}</a>`;
+        ? `<a href="#" data-url=${remove_protocol(
+            row["url"]
+          )}>${true_icon}</a>`
+        : `<a href="#" data-url=${remove_protocol(
+            row["url"]
+          )}>${false_icon}</a>`;
    },
  };
 }

@@ -1016,8 +1496,8 @@ function getDocumentTypeColumn() {
      button_color = data ? "btn-success" : "btn-secondary";
+      // NOTE: the button element is a reconstruction around the surviving
+      // ${button_color} class; its other attributes and contents are assumed.
      return `<button class="btn btn-sm ${button_color}">`;
@@ -323,15 +523,18 @@
@@ -349,15 +552,18 @@
@@ -375,19 +581,23 @@
@@ -405,7 +615,8 @@