From 9c7b25bc0f43234fa9ac097f2a53a9ef02aae131 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:04:46 -0500 Subject: [PATCH 001/330] adding the new base URL model --- sde_collections/models/url.py | 85 +++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 sde_collections/models/url.py diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py new file mode 100644 index 00000000..7ce86dff --- /dev/null +++ b/sde_collections/models/url.py @@ -0,0 +1,85 @@ +import os +from urllib.parse import urlparse + +from django.db import models + +from .collection import Collection +from .collection_choice_fields import Divisions, DocumentTypes +from .pattern import ExcludePattern + + +class UrlQuerySet(models.QuerySet): + def with_exclusion_status(self): + return self.annotate( + excluded=models.Exists( + ExcludePattern.candidate_urls.through.objects.filter(candidateurl=models.OuterRef("pk")) + ) + ) + + +class UrlManager(models.Manager): + def get_queryset(self): + return UrlQuerySet(self.model, using=self._db).with_exclusion_status() + + +class Url(models.Model): + """This is the base URL model which serves as a base for DeltaUrl and CuratedUrl.""" + + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="urls") + url = models.CharField("URL", max_length=4096) + scraped_title = models.CharField( + "Scraped Title", + max_length=1024, + default="", + blank=True, + help_text="This is the original title scraped by Sinequa", + ) + generated_title = models.CharField( + "Generated Title", + max_length=1024, + default="", + blank=True, + help_text="This is the title generated based on a Title Pattern", + ) + visited = models.BooleanField(default=False) + document_type = models.IntegerField(choices=DocumentTypes.choices, null=True) + division = models.IntegerField(choices=Divisions.choices, null=True) + + objects = UrlManager() + + class Meta: + verbose_name = "URL" + verbose_name_plural = "URLs" + ordering = ["url"] + + @property + def fileext(self) -> str: + parsed_url = urlparse(self.url) + path = parsed_url.path + if path.endswith("/") or not path: + return "html" + extension = os.path.splitext(path)[1] + return extension[1:] if extension.startswith(".") else extension or "html" + + def splits(self) -> list[tuple[str, str]]: + parts = [] + part_string = "" + for part in self.path.split("/"): + if part: + part_string += f"/{part}" + parts.append((part_string, part)) + return parts + + @property + def path(self) -> str: + parsed = urlparse(self.url) + path = f"{parsed.path}" + if parsed.query: + path += f"?{parsed.query}" + return path + + def __str__(self) -> str: + return self.url + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) From 115481d5359ff7064061ea18ff0e02152343fbd7 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:08 -0500 Subject: [PATCH 002/330] adding the new dump url model --- sde_collections/models/dump_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py new file mode 100644 index 00000000..85ef85d9 --- /dev/null +++ b/sde_collections/models/dump_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class DumpUrl(Url): + """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" From 
8af6102de71cb45d5e32f0c61dedf011583df1d0 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:28 -0500 Subject: [PATCH 003/330] adding the new delta url model --- sde_collections/models/delta_url.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 sde_collections/models/delta_url.py diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py new file mode 100644 index 00000000..028607ab --- /dev/null +++ b/sde_collections/models/delta_url.py @@ -0,0 +1,13 @@ +from django.db import models + +from .url import Url + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + verbose_name_plural = "Delta URLs" From 3f9c88520939f53a61711f6c2dc6f0ec351c6918 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:05:50 -0500 Subject: [PATCH 004/330] adding the new curated url model --- sde_collections/models/curated_url.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 sde_collections/models/curated_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py new file mode 100644 index 00000000..d55dcb5f --- /dev/null +++ b/sde_collections/models/curated_url.py @@ -0,0 +1,9 @@ +from .url import Url + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 3c9627fc3e67d477f2746d63a8304695b334ed5e Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:06:08 -0500 Subject: [PATCH 005/330] adding the necessary migration file --- .../0059_url_curatedurl_deltaurl_dumpurl.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py new file mode 100644 index 00000000..82f4d4af --- /dev/null +++ b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -0,0 +1,146 @@ +# Generated by Django 4.2.9 on 2024-10-10 03:01 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="Url", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(max_length=4096, verbose_name="URL")), + ( + "scraped_title", + models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + max_length=1024, + verbose_name="Scraped Title", + ), + ), + ( + "generated_title", + models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + max_length=1024, + verbose_name="Generated Title", + ), + ), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, 
"Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "verbose_name": "URL", + "verbose_name_plural": "URLs", + "ordering": ["url"], + }, + ), + migrations.CreateModel( + name="CuratedUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Curated URL", + "verbose_name_plural": "Curated URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DeltaUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ("delete", models.BooleanField(default=False)), + ], + options={ + "verbose_name": "Delta URL", + "verbose_name_plural": "Delta URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 2fcd346a2260779f64f319f7c63436792ca13cc1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:08:41 -0500 Subject: [PATCH 006/330] adding a command file to migrate urls into delta and curated URL models --- .../management/commands/migrate_urls.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 sde_collections/management/commands/migrate_urls.py diff --git a/sde_collections/management/commands/migrate_urls.py b/sde_collections/management/commands/migrate_urls.py new file mode 100644 index 00000000..6958c107 --- /dev/null +++ b/sde_collections/management/commands/migrate_urls.py @@ -0,0 +1,59 @@ +from django.core.management.base import BaseCommand + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices +from sde_collections.models.curated_url import CuratedUrl +from sde_collections.models.delta_url import DeltaUrl + + +class Command(BaseCommand): + help = "Migrate CandidateURLs to CuratedUrl or DeltaUrl based on collection workflow status" + + def handle(self, *args, **kwargs): + # Migrate CandidateURLs for collections with CURATED or higher workflow status to CuratedUrl + collections_for_curated = Collection.objects.filter(workflow_status__gte=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_curated.count()} collections with CURATED or higher status..." 
+ ) + + for collection in collections_for_curated: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + CuratedUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to CuratedUrl." + ) + + # Migrate CandidateURLs for collections with a status lower than CURATED to DeltaUrl + collections_for_delta = Collection.objects.filter(workflow_status__lt=WorkflowStatusChoices.CURATED) + self.stdout.write( + f"Migrating URLs for {collections_for_delta.count()} collections with status lower than CURATED..." + ) + + for collection in collections_for_delta: + candidate_urls = CandidateURL.objects.filter(collection=collection) + for candidate_url in candidate_urls: + DeltaUrl.objects.create( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + delete=False, + ) + self.stdout.write( + f"Migrated {candidate_urls.count()} URLs from collection '{collection.name}' to DeltaUrl." + ) + + self.stdout.write(self.style.SUCCESS("Migration complete.")) From d691af30fa10362f35a61b6bfd9f175ba3175bac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Wed, 9 Oct 2024 22:09:37 -0500 Subject: [PATCH 007/330] added the new models into admin console --- sde_collections/admin.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cb105f80..e4ff5097 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,7 +5,11 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -299,9 +303,41 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class CuratedUrlAdmin(admin.ModelAdmin): + """Admin View for CuratedUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + +class DeltaUrlAdmin(admin.ModelAdmin): + """Admin View for DeltaUrl""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) +admin.site.register(Url, UrlAdmin) 
+admin.site.register(DeltaUrl, DeltaUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(CuratedUrl, CuratedUrlAdmin) From a17029f88dc2644e3705ed11aa9ce9a4e727c431 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:01:11 -0500 Subject: [PATCH 008/330] removed url and dumpurl models from admin --- sde_collections/admin.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index e4ff5097..4fce1ea7 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,9 +7,7 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -303,20 +301,6 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") -class UrlAdmin(admin.ModelAdmin): - """Admin View for Url""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - -class DumpUrlAdmin(admin.ModelAdmin): - """Admin View for DumpUrl""" - - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) - - class CuratedUrlAdmin(admin.ModelAdmin): """Admin View for CuratedUrl""" @@ -337,7 +321,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) -admin.site.register(Url, UrlAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) -admin.site.register(DumpUrl, DumpUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) From 8606581c8e7970403519499e7171ae8503f7c296 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:10 -0500 Subject: [PATCH 009/330] edited the curated url api serialzier used for indexing --- sde_collections/serializers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..2f11700b 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,6 +3,7 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -107,19 +108,18 @@ class Meta: ) -class CandidateURLAPISerializer(serializers.ModelSerializer): +class CuratedUrlAPISerializer(serializers.ModelSerializer): document_type = serializers.SerializerMethodField() title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() class Meta: - model = CandidateURL + model = CuratedUrl fields = ( "url", "title", "document_type", - "hash", "file_extension", "tree_root", ) From 0f8578cb2059bfce3c9f0508663090fd7e6c08ff Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:02:35 -0500 Subject: [PATCH 010/330] changed the api endpoit to have an appropriate name --- sde_collections/urls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 4e3d0534..214d1198 100644 --- a/sde_collections/urls.py +++ 
b/sde_collections/urls.py @@ -55,9 +55,9 @@ # Delete an existing CandidateURL instance: /candidate-urls/{id}/ path("api/", include(router.urls)), path( - "candidate-urls-api//", - view=views.CandidateURLAPIView.as_view(), - name="candidate-url-api", + "curated-urls-api//", + view=views.CuratedURLAPIView.as_view(), + name="curated-url-api", ), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), ] From 717eb533f59878f776b45b43216464323127341f Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:03:22 -0500 Subject: [PATCH 011/330] changed the api vew to point to the right curated url model --- sde_collections/views.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index 241979ba..b8ff70a0 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,6 +27,7 @@ DocumentTypes, WorkflowStatusChoices, ) +from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,11 +36,11 @@ TitlePattern, ) from .serializers import ( - CandidateURLAPISerializer, CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, + CuratedUrlAPISerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -307,8 +308,8 @@ def create(self, request, *args, **kwargs): return Response(serializer.data, status=status.HTTP_201_CREATED) -class CandidateURLAPIView(ListAPIView): - serializer_class = CandidateURLAPISerializer +class CuratedURLAPIView(ListAPIView): + serializer_class = CuratedUrlAPISerializer def get(self, request, *args, **kwargs): config_folder = kwargs.get("config_folder") @@ -317,7 +318,7 @@ def get(self, request, *args, **kwargs): def get_queryset(self): queryset = ( - CandidateURL.objects.filter(collection__config_folder=self.config_folder) + CuratedUrl.objects.filter(collection__config_folder=self.config_folder) .with_exclusion_status() .filter(excluded=False) ) From 83cb35a45d39dba10fcc22e0d7b6ae7979cc299b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 14 Oct 2024 12:03:36 -0500 Subject: [PATCH 012/330] migration file with changes --- .../migrations/0060_delete_dumpurl.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 sde_collections/migrations/0060_delete_dumpurl.py diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py new file mode 100644 index 00000000..db9a10c1 --- /dev/null +++ b/sde_collections/migrations/0060_delete_dumpurl.py @@ -0,0 +1,16 @@ +# Generated by Django 4.2.9 on 2024-10-14 16:37 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"), + ] + + operations = [ + migrations.DeleteModel( + name="DumpUrl", + ), + ] From 27d0b49bff19ce81905286f3b3cb2925132dcca0 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 14 Oct 2024 20:40:01 -0500 Subject: [PATCH 013/330] change EnableNeuralIndexing to true in indexing template --- config_generation/xmls/plugin_indexing_template.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index 44bfba6c..34aea51f 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -20,7 
+20,7 @@ - false + true true false From d537302dcbbc288175ec81f62994b5fec84fbcbc Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Thu, 17 Oct 2024 13:52:37 -0500 Subject: [PATCH 014/330] add per indicator thrsholding and new dump --- scripts/ej/cmr_to_models.py | 2 +- scripts/ej/create_ej_dump.py | 37 +++++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py index 130de722..f7ba46db 100644 --- a/scripts/ej/cmr_to_models.py +++ b/scripts/ej/cmr_to_models.py @@ -69,7 +69,7 @@ def categorize_processing_level(level): # remove existing data EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete() -ej_dump = json.load(open("backups/ej_dump_20240815_112916.json")) +ej_dump = json.load(open("backups/ej_dump_20241017_133151.json.json")) for dataset in ej_dump: ej_row = EnvironmentalJusticeRow( destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV, diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py index 36d7f722..c44aebc5 100644 --- a/scripts/ej/create_ej_dump.py +++ b/scripts/ej/create_ej_dump.py @@ -2,7 +2,7 @@ inferences are supplied by the classification model. the contact point is Bishwas cmr is supplied by running github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py -move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/ +move to the server like this: scp ej_dump_20241017_133151.json sde:/home/ec2-user/sde_indexing_helper/backups/ """ import json @@ -19,12 +19,12 @@ def save_to_json(data: dict | list, file_path: str) -> None: json.dump(data, file, indent=2) -def process_classifications(predictions: list[dict[str, float]], threshold: float = 0.5) -> list[str]: +def process_classifications(predictions: list[dict[str, float]], thresholds: dict[str, float]) -> list[str]: """ - Process the predictions and classify as follows: - 1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification - 2. Filter classifications based on the threshold, excluding 'Not EJ' - 3. Default to 'Not EJ' if no classifications meet the threshold + Process the predictions and classify based on the individual thresholds per indicator: + 1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification. + 2. Filter classifications based on their individual thresholds, excluding 'Not EJ'. + 3. Default to 'Not EJ' if no classifications meet the threshold. 
""" highest_prediction = max(predictions, key=lambda x: x["score"]) @@ -32,7 +32,9 @@ def process_classifications(predictions: list[dict[str, float]], threshold: floa return ["Not EJ"] classifications = [ - pred["label"] for pred in predictions if pred["score"] >= threshold and pred["label"] != "Not EJ" + pred["label"] + for pred in predictions + if pred["score"] >= thresholds[pred["label"]] and pred["label"] != "Not EJ" ] return classifications if classifications else ["Not EJ"] @@ -63,14 +65,14 @@ def remove_unauthorized_classifications(classifications: list[str]) -> list[str] def update_cmr_with_classifications( inferences: list[dict[str, dict]], cmr_dict: dict[str, dict[str, dict]], - threshold: float = 0.5, + thresholds: dict[str, float], ) -> list[dict[str, dict]]: """Update CMR data with valid classifications based on inferences.""" predicted_cmr = [] for inference in inferences: - classifications = process_classifications(predictions=inference["predictions"], threshold=threshold) + classifications = process_classifications(predictions=inference["predictions"], thresholds=thresholds) classifications = remove_unauthorized_classifications(classifications) if classifications: @@ -84,17 +86,30 @@ def update_cmr_with_classifications( def main(): - inferences = load_json_file("cmr-inference.json") + thresholds = { + "Not EJ": 0.80, + "Climate Change": 0.95, + "Disasters": 0.80, + "Extreme Heat": 0.50, + "Food Availability": 0.80, + "Health & Air Quality": 0.90, + "Human Dimensions": 0.80, + "Urban Flooding": 0.50, + "Water Availability": 0.80, + } + + inferences = load_json_file("alpha-1.3-wise-vortex-42-predictions.json") cmr = load_json_file("cmr_collections_umm_20240807_142146.json") cmr_dict = create_cmr_dict(cmr) - predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, threshold=0.8) + predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, thresholds=thresholds) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") file_name = f"ej_dump_{timestamp}.json" save_to_json(predicted_cmr, file_name) + print(f"Saved to {file_name}") if __name__ == "__main__": From b559facb6a5a43104445943cf1eadec4fe6ae0e7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 23 Oct 2024 21:48:31 -0500 Subject: [PATCH 015/330] Fixes issue #1071 --- .envs/.local/.django | 5 + sde_collections/admin.py | 18 ++ .../0059_candidateurl_scraped_text.py | 18 ++ sde_collections/models/candidate_url.py | 1 + sde_collections/tasks.py | 192 +++++++++++++++++- 5 files changed, 232 insertions(+), 2 deletions(-) create mode 100644 sde_collections/migrations/0059_candidateurl_scraped_text.py diff --git a/.envs/.local/.django b/.envs/.local/.django index 402efc3c..ce2e8095 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -39,3 +39,8 @@ XLI_USER='' XLI_PASSWORD='' LRM_QA_USER='' LRM_QA_PASSWORD='' + +#Server Tokens +#-------------------------------------------------------------------------------- +LRMDEV_TOKEN='' +LIS_TOKEN='' \ No newline at end of file diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cb105f80..ecf92838 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,6 +7,22 @@ from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api +from .tasks import fetch_and_update_full_text + + +@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") +def 
fetch_full_text_lrm_dev_action(modeladmin, request, queryset): + for collection in queryset: + fetch_and_update_full_text.delay(collection.id, "LRM_DEV") + modeladmin.message_user(request, "Full text fetched and updated from LRM_DEV successfully.") + + +@admin.action(description="Import candidate URLs from Li's Server with Full Text") +def fetch_full_text_lis_action(modeladmin, request, queryset): + for collection in queryset: + fetch_and_update_full_text.delay(collection.id, "LIS") + modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.") + @admin.action(description="Generate deployment message") @@ -239,6 +255,8 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): import_candidate_urls_lis_server, import_candidate_urls_lrm_dev_server, import_candidate_urls_lrm_qa_server, + fetch_full_text_lrm_dev_action, + fetch_full_text_lis_action, ] ordering = ("cleaning_order",) diff --git a/sde_collections/migrations/0059_candidateurl_scraped_text.py b/sde_collections/migrations/0059_candidateurl_scraped_text.py new file mode 100644 index 00000000..cc3ea65b --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_scraped_text.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.9 on 2024-10-21 23:10 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="scraped_text", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 51c3a28b..936ea363 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -35,6 +35,7 @@ class CandidateURL(models.Model): blank=True, help_text="This is the original title scraped by Sinequa", ) + scraped_text = models.TextField(blank=True, null=True) generated_title = models.CharField( "Generated Title", default="", diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index fa754efc..3172b22f 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,13 +1,13 @@ import json import os import shutil - +import requests import boto3 from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata - +from sde_collections.models.candidate_url import CandidateURL from config import celery_app from .models.collection import Collection, WorkflowStatusChoices @@ -141,3 +141,191 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() +''' +@celery_app.task +def fetch_and_update_full_text(collection_id): + + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server + sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" + token = os.getenv('LRMDEV_TOKEN') + + + payload = json.dumps({ + "method": "engine.sql", + "sql": sql_command, + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + headers = { + 
'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + # Directly update or create the entry without checking for content changes + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Processed {len(records)} records; Updated or created in database." + else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + ''' + +#You will have to have a different function for Li's server as it uses user and pw with body to login. +#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 +#Fill in the tokens in the .django file + +#Integrated - LRM devs and Lis separate +''' +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + # Server-specific configurations + server_config = get_server_config(server_type) + + # API Request Parameters + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + token = server_config["token"] + url = server_config["url"] + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + # Send the request + response = requests.post(url, headers=headers, data=payload) + if response.status_code == 200: + records = response.json().get("Rows", []) + for record in records: + url, full_text, title = record + if not url or not full_text or not title: + continue + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + return f"Processed {len(records)} records; Updated or created in database." 
+ else: + raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") + + +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") +''' + + +@celery_app.task +def fetch_and_update_full_text(collection_id, server_type): + try: + collection = Collection.objects.get(id=collection_id) + except Collection.DoesNotExist: + raise Exception(f"Collection with ID {collection_id} does not exist.") + + server_config = get_server_config(server_type) + token = server_config["token"] + url = server_config["url"] + + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {token}' + } + + payload = json.dumps({ + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default" + }) + + try: + response = requests.post(url, headers=headers, data=payload, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + except requests.exceptions.RequestException as e: + raise Exception(f"API request failed: {str(e)}") + + records = response.json().get("Rows", []) + if not records: + return "No records found in the response." + + for record in records: + url, full_text, title = record + if not (url and full_text and title): + continue + + CandidateURL.objects.update_or_create( + url=url, + collection=collection, + defaults={ + 'scraped_text': full_text, + 'scraped_title': title + } + ) + + return f"Successfully processed {len(records)} records and updated the database." 
+ +def get_server_config(server_type): + if server_type == "LRM_DEV": + return { + "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LRMDEV_TOKEN") + } + elif server_type == "LIS": + return { + "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", + "token": os.getenv("LIS_TOKEN") + } + else: + raise ValueError("Invalid server type.") + From 8678ed6e83edc61461c51a51cc8bd9b5c9190dde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:05:09 +0000 Subject: [PATCH 016/330] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .envs/.local/.django | 2 +- sde_collections/admin.py | 4 +- sde_collections/tasks.py | 82 ++++++++++++++++++---------------------- 3 files changed, 39 insertions(+), 49 deletions(-) diff --git a/.envs/.local/.django b/.envs/.local/.django index ce2e8095..07e159fa 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -43,4 +43,4 @@ LRM_QA_PASSWORD='' #Server Tokens #-------------------------------------------------------------------------------- LRMDEV_TOKEN='' -LIS_TOKEN='' \ No newline at end of file +LIS_TOKEN='' diff --git a/sde_collections/admin.py b/sde_collections/admin.py index ecf92838..7b519a15 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,8 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .tasks import import_candidate_urls_from_api -from .tasks import fetch_and_update_full_text +from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api @admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") @@ -22,7 +21,6 @@ def fetch_full_text_lis_action(modeladmin, request, queryset): for collection in queryset: fetch_and_update_full_text.delay(collection.id, "LIS") modeladmin.message_user(request, "Full text fetched and updated from Li's Server successfully.") - @admin.action(description="Generate deployment message") diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 3172b22f..8d93a2de 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -1,14 +1,16 @@ import json import os import shutil -import requests + import boto3 +import requests from django.apps import apps from django.conf import settings from django.core import management from django.core.management.commands import loaddata -from sde_collections.models.candidate_url import CandidateURL + from config import celery_app +from sde_collections.models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowStatusChoices from .sinequa_api import Api @@ -141,15 +143,17 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() -''' + + +""" @celery_app.task def fetch_and_update_full_text(collection_id): - + try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" token = os.getenv('LRMDEV_TOKEN') @@ -164,12 +168,12 @@ def 
fetch_and_update_full_text(collection_id): "resolveIndexList": "false", "engines": "default" }) - + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {token}' } - + response = requests.post(url, headers=headers, data=payload) if response.status_code == 200: records = response.json().get("Rows", []) @@ -190,21 +194,21 @@ def fetch_and_update_full_text(collection_id): return f"Processed {len(records)} records; Updated or created in database." else: raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - ''' + """ -#You will have to have a different function for Li's server as it uses user and pw with body to login. -#If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 -#Fill in the tokens in the .django file +# You will have to have a different function for Li's server as it uses user and pw with body to login. +# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). if lis then header1, elif lrm_dev then h2, else h3 +# Fill in the tokens in the .django file -#Integrated - LRM devs and Lis separate -''' +# Integrated - LRM devs and Lis separate +""" @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: collection = Collection.objects.get(id=collection_id) except Collection.DoesNotExist: raise Exception(f"Collection with ID {collection_id} does not exist.") - + # Server-specific configurations server_config = get_server_config(server_type) @@ -260,7 +264,7 @@ def get_server_config(server_type): } else: raise ValueError("Invalid server type.") -''' +""" @celery_app.task @@ -274,20 +278,19 @@ def fetch_and_update_full_text(collection_id, server_type): token = server_config["token"] url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + + payload = json.dumps( + { + "method": "engine.sql", + "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", + "pretty": True, + "log": False, + "output": "json", + "resolveIndexList": "false", + "engines": "default", + } + ) try: response = requests.post(url, headers=headers, data=payload, timeout=10) @@ -302,30 +305,19 @@ def fetch_and_update_full_text(collection_id, server_type): for record in records: url, full_text, title = record if not (url and full_text and title): - continue + continue CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } + url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title} ) return f"Successfully processed {len(records)} records and updated the database." 
+ def get_server_config(server_type): if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } + return {"url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LRMDEV_TOKEN")} elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } + return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - From e4881a94adaa5dba4d9dca928a55117ef4e671b7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 23 Oct 2024 22:24:32 -0500 Subject: [PATCH 017/330] Fixes issue #1071 --- sde_collections/tasks.py | 124 +-------------------------------------- 1 file changed, 1 insertion(+), 123 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 8d93a2de..0c54ea0c 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,129 +144,6 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() - -""" -@celery_app.task -def fetch_and_update_full_text(collection_id): - - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - url = "https://sde-lrm.nasa-impact.net/api/v1/engine.sql" #LRM_DEV Server - sql_command = f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'" - token = os.getenv('LRMDEV_TOKEN') - - - payload = json.dumps({ - "method": "engine.sql", - "sql": sql_command, - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - # Directly update or create the entry without checking for content changes - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - """ - -# You will have to have a different function for Li's server as it uses user and pw with body to login. -# If the sinequa web token is used, can user&pw be removed from the body? if yes then can integrate, but headers will b diff (auth/cookie). 
if lis then header1, elif lrm_dev then h2, else h3 -# Fill in the tokens in the .django file - -# Integrated - LRM devs and Lis separate -""" -@celery_app.task -def fetch_and_update_full_text(collection_id, server_type): - try: - collection = Collection.objects.get(id=collection_id) - except Collection.DoesNotExist: - raise Exception(f"Collection with ID {collection_id} does not exist.") - - # Server-specific configurations - server_config = get_server_config(server_type) - - # API Request Parameters - payload = json.dumps({ - "method": "engine.sql", - "sql": f"SELECT url1, text, title FROM sde_index WHERE collection = '/SDE/{collection.config_folder}/'", - "pretty": True, - "log": False, - "output": "json", - "resolveIndexList": "false", - "engines": "default" - }) - - token = server_config["token"] - url = server_config["url"] - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {token}' - } - - # Send the request - response = requests.post(url, headers=headers, data=payload) - if response.status_code == 200: - records = response.json().get("Rows", []) - for record in records: - url, full_text, title = record - if not url or not full_text or not title: - continue - CandidateURL.objects.update_or_create( - url=url, - collection=collection, - defaults={ - 'scraped_text': full_text, - 'scraped_title': title - } - ) - return f"Processed {len(records)} records; Updated or created in database." - else: - raise Exception(f"Failed to fetch text: {response.status_code} {response.text}") - - -def get_server_config(server_type): - if server_type == "LRM_DEV": - return { - "url": "https://sde-lrm.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LRMDEV_TOKEN") - } - elif server_type == "LIS": - return { - "url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", - "token": os.getenv("LIS_TOKEN") - } - else: - raise ValueError("Invalid server type.") -""" - - @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -321,3 +198,4 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") + \ No newline at end of file From 47f164f7f7a5d3a1f3f983d92d9d1bd4636f087b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 03:25:01 +0000 Subject: [PATCH 018/330] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index 0c54ea0c..f505c942 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -144,6 +144,7 @@ def resolve_title_pattern(title_pattern_id): title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() + @celery_app.task def fetch_and_update_full_text(collection_id, server_type): try: @@ -198,4 +199,3 @@ def get_server_config(server_type): return {"url": "http://sde-xli.nasa-impact.net/api/v1/engine.sql", "token": os.getenv("LIS_TOKEN")} else: raise ValueError("Invalid server type.") - \ No newline at end of file From f4849e862184c83e20e115f2ce2beffb38daf914 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Tue, 29 Oct 2024 23:09:16 -0500 Subject: [PATCH 019/330] add PairedFieldDescriptor two-column tag model --- ...ection_tdamm_manual_collection_tdamm_ml.py | 23 ++++++++++++++++++ 
sde_collections/models/collection.py | 5 ++++ .../utils/paired_field_descriptor.py | 24 +++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py create mode 100644 sde_collections/utils/paired_field_descriptor.py diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py new file mode 100644 index 00000000..557ad13e --- /dev/null +++ b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.9 on 2024-10-30 00:44 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="collection", + name="tdamm_manual", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AddField( + model_name="collection", + name="tdamm_ml", + field=models.CharField(blank=True, max_length=255, null=True), + ), + ] diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 31306b8c..a2d3181c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -26,6 +26,7 @@ UpdateFrequencies, WorkflowStatusChoices, ) +from ..utils.paired_field_descriptor import PairedFieldDescriptor User = get_user_model() @@ -33,6 +34,10 @@ class Collection(models.Model): """Model definition for Collection.""" + tdamm_manual = models.CharField(max_length=255, null=True, blank=True) + tdamm_ml = models.CharField(max_length=255, null=True, blank=True) + tdamm = PairedFieldDescriptor('tdamm') + name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) url = models.URLField("URL", max_length=2048) diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py new file mode 100644 index 00000000..e07d41dc --- /dev/null +++ b/sde_collections/utils/paired_field_descriptor.py @@ -0,0 +1,24 @@ +from django.db import models + + +class PairedFieldDescriptor: + def __init__(self, field_name): + self.manual_field_name = f"{field_name}_manual" + self.ml_field_name = f"{field_name}_ml" + + def __get__(self, instance, owner): + if instance is None: + return self + # Return manual tag if available, otherwise ML tag + manual_value = getattr(instance, self.manual_field_name, None) + machine_learning_value = getattr(instance, self.ml_field_name, None) + return manual_value if manual_value is not None else machine_learning_value + + def __set__(self, instance, value): + # Set the value of the manual field + setattr(instance, self.manual_field_name, value) + + def __delete__(self, instance): + # Delete both manual and ML fields + delattr(instance, self.manual_field_name) + delattr(instance, self.ml_field_name) From a469ef1242824645885433ac0d3ecd8d4a23a7fe Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Wed, 30 Oct 2024 16:14:07 -0500 Subject: [PATCH 020/330] add fields to admin panel --- sde_collections/admin.py | 35 +++++++++++++++++++ ...remove_collection_tdamm_manual_and_more.py | 31 ++++++++++++++++ sde_collections/models/collection.py | 6 ++-- sde_collections/serializers.py | 1 + 4 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 
sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index cb105f80..add9a906 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,6 +7,7 @@ from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api +from django import forms @admin.action(description="Generate deployment message") @@ -174,10 +175,34 @@ def update_config(self, request, queryset): update_config.short_description = "Update configs of selected" +class CollectionForm(forms.ModelForm): + tdamm_tag = forms.CharField(required=False, label="TDAMM Tag") + + class Meta: + model = Collection + fields = "__all__" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.instance and hasattr(self.instance, "tdamm_tag"): + # Set the initial value of tdamm_tag to the computed value + self.fields["tdamm_tag"].initial = self.instance.tdamm_tag + + def clean(self): + cleaned_data = super().clean() + tdamm_value = cleaned_data.get("tdamm_tag") + if tdamm_value: + # Set the manual field with the value from tdamm + cleaned_data["tdamm_tag_manual"] = tdamm_value + return cleaned_data + + @admin.register(Collection) class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): """Admin View for Collection""" + form = CollectionForm + fieldsets = ( ( "Essential information", @@ -187,6 +212,9 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): "config_folder", "url", "division", + "tdamm_tag", + "tdamm_tag_ml", + "tdamm_tag_manual", "document_type", "update_frequency", "source", @@ -215,15 +243,22 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): ), ) + def tdamm_tag(self, obj): + return obj.tdamm_tag + list_display = ( "name", "candidate_urls_count", "config_folder", "url", + "tdamm_tag", + "tdamm_tag_ml", + "tdamm_tag_manual", "division", "new_collection", "is_multi_division", ) + readonly_fields = ("config_folder",) list_filter = ("division", "curation_status", "workflow_status", "turned_on", "is_multi_division") search_fields = ("name", "url", "config_folder") diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py new file mode 100644 index 00000000..37b817a7 --- /dev/null +++ b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py @@ -0,0 +1,31 @@ +# Generated by Django 4.2.9 on 2024-10-30 21:05 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"), + ] + + operations = [ + migrations.RemoveField( + model_name="collection", + name="tdamm_manual", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_ml", + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_manual", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_ml", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), + ), + ] diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index a2d3181c..1d140889 100644 --- a/sde_collections/models/collection.py +++ 
b/sde_collections/models/collection.py @@ -34,9 +34,9 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_manual = models.CharField(max_length=255, null=True, blank=True) - tdamm_ml = models.CharField(max_length=255, null=True, blank=True) - tdamm = PairedFieldDescriptor('tdamm') + tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") + tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") + tdamm_tag = PairedFieldDescriptor('tdamm_tag') name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 9623e85d..19717818 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,6 +26,7 @@ class Meta: "workflow_status_display", "curated_by", "division", + "tdamm_tag", "document_type", "name", ) From 8e8e0ac743f6a915e8196c6dc9914060766315eb Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 00:13:31 -0600 Subject: [PATCH 021/330] moved tdamm_tags feature from collection to candidate_url --- sde_collections/admin.py | 127 ++++++++++++----- ..._candidateurl_tdamm_tag_manual_and_more.py | 134 ++++++++++++++++++ ...ection_tdamm_manual_collection_tdamm_ml.py | 23 --- ...remove_collection_tdamm_manual_and_more.py | 31 ---- sde_collections/models/candidate_url.py | 60 +++++++- sde_collections/models/collection.py | 5 - sde_collections/serializers.py | 2 +- 7 files changed, 287 insertions(+), 95 deletions(-) create mode 100644 sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py delete mode 100644 sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py delete mode 100644 sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index add9a906..73576899 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,6 +8,7 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms +from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -175,34 +176,10 @@ def update_config(self, request, queryset): update_config.short_description = "Update configs of selected" -class CollectionForm(forms.ModelForm): - tdamm_tag = forms.CharField(required=False, label="TDAMM Tag") - - class Meta: - model = Collection - fields = "__all__" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.instance and hasattr(self.instance, "tdamm_tag"): - # Set the initial value of tdamm_tag to the computed value - self.fields["tdamm_tag"].initial = self.instance.tdamm_tag - - def clean(self): - cleaned_data = super().clean() - tdamm_value = cleaned_data.get("tdamm_tag") - if tdamm_value: - # Set the manual field with the value from tdamm - cleaned_data["tdamm_tag_manual"] = tdamm_value - return cleaned_data - - @admin.register(Collection) class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): """Admin View for Collection""" - form = CollectionForm - fieldsets = ( ( "Essential information", @@ -212,9 +189,6 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): "config_folder", "url", "division", - "tdamm_tag", - "tdamm_tag_ml", - 
"tdamm_tag_manual", "document_type", "update_frequency", "source", @@ -243,17 +217,11 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): ), ) - def tdamm_tag(self, obj): - return obj.tdamm_tag - list_display = ( "name", "candidate_urls_count", "config_folder", "url", - "tdamm_tag", - "tdamm_tag_ml", - "tdamm_tag_manual", "division", "new_collection", "is_multi_division", @@ -296,13 +264,104 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() +class CandidateURLForm(forms.ModelForm): + # tdamm_tag = forms.MultipleChoiceField( + # choices=CandidateURL.TDAMM_TAG_CHOICES, + # required=False, + # label="TDAMM Tags", + # widget=forms.CheckboxSelectMultiple, + # ) + + tdamm_tag_ml = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM ML Tags", + widget=forms.CheckboxSelectMultiple, + ) + + tdamm_tag_manual = forms.MultipleChoiceField( + choices=CandidateURL.TDAMM_TAG_CHOICES, + required=False, + label="TDAMM Manual Tags", + widget=forms.CheckboxSelectMultiple, + ) + + class Meta: + model = CandidateURL + fields = '__all__' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tdamm_tag + # if self.instance and hasattr(self.instance, 'tdamm_tag'): + # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] + + # Initialize tdamm_tag_ml + if self.instance and self.instance.tdamm_tag_ml: + self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml + + # Initialize tdamm_tag_manual + if self.instance and self.instance.tdamm_tag_manual: + self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + + def clean(self): + cleaned_data = super().clean() + + # Handle tdamm_tag + # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) + # if not tdamm_tag_value: + # cleaned_data['tdamm_tag_manual'] = None + # else: + # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value + + # Handle tdamm_tag_ml + tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) + if not tdamm_tag_ml_value: + cleaned_data['tdamm_tag_ml'] = None + + # Handle tdamm_tag_manual + tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) + if not tdamm_tag_manual_value: + cleaned_data['tdamm_tag_manual'] = None + + return cleaned_data class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" - list_display = ("url", "scraped_title", "collection") + form = CandidateURLForm + + list_display = ( + "url", + "scraped_title", + "collection", + # "tdamm_tag_display", + "tdamm_tag_ml_display", + "tdamm_tag_manual_display" + ) list_filter = ("collection",) + # @admin.display(description='TDAMM Tags') + # def tdamm_tag_display(self, obj): + # if obj.tdamm_tag: + # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] + # return ", ".join(readable_tags) + # return "" + + @admin.display(description='TDAMM ML Tags') + def tdamm_tag_ml_display(self, obj): + if obj.tdamm_tag_ml: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] + return ", ".join(readable_tags) + return "" + + @admin.display(description='TDAMM Manual Tags') + def tdamm_tag_manual_display(self, obj): + if obj.tdamm_tag_manual: + readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] + return ", ".join(readable_tags) + return "" + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for 
TitlePattern""" diff --git a/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..057f1ed6 --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,134 @@ +# Generated by Django 4.2.9 on 2024-11-02 04:36 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger 
- EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_manual", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), + ), + migrations.AddField( + model_name="collection", + name="tdamm_tag_ml", + field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), + ), + ] diff --git a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py b/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py deleted file mode 100644 index 557ad13e..00000000 --- a/sde_collections/migrations/0059_collection_tdamm_manual_collection_tdamm_ml.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-30 00:44 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), - ] - - operations = [ - migrations.AddField( - model_name="collection", - name="tdamm_manual", - field=models.CharField(blank=True, max_length=255, null=True), - ), - migrations.AddField( - model_name="collection", - name="tdamm_ml", - field=models.CharField(blank=True, max_length=255, null=True), - ), - ] diff --git a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py b/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py deleted file mode 100644 index 37b817a7..00000000 --- a/sde_collections/migrations/0060_remove_collection_tdamm_manual_and_more.py +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by 
Django 4.2.9 on 2024-10-30 21:05 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_collection_tdamm_manual_collection_tdamm_ml"), - ] - - operations = [ - migrations.RemoveField( - model_name="collection", - name="tdamm_manual", - ), - migrations.RemoveField( - model_name="collection", - name="tdamm_ml", - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_manual", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM Manual Tag"), - ), - migrations.AddField( - model_name="collection", - name="tdamm_tag_ml", - field=models.CharField(blank=True, max_length=255, null=True, verbose_name="TDAMM ML Tag"), - ), - ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 51c3a28b..f8c91a97 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -7,7 +7,8 @@ from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern - +from ..utils.paired_field_descriptor import PairedFieldDescriptor +from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): @@ -80,6 +81,63 @@ class CandidateURL(models.Model): help_text="Helps keep track if the Current URL is present in production or not", ) + TDAMM_TAG_CHOICES = [ + ('MMA_M_EM', 'Messenger - EM Radiation'), + ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), + ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), + ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), + ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), + ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), + ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), + ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), + ('MMA_M_G', 'Messenger - Gravitational Waves'), + ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), + ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), + ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), + ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), + ('MMA_M_C', 'Messenger - Cosmic Rays'), + ('MMA_M_N', 'Messenger - Neutrinos'), + ('MMA_O_BI', 'Objects - Binaries'), + ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), + ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), + ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), + ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), + ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), + ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), + ('MMA_O_BH', 'Objects - Black Holes'), + ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), + ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), + ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), + ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), + ('MMA_O_E', 'Objects - Exoplanets'), + ('MMA_O_N', 'Objects - Neutron Stars'), + ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), + ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), + ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), + ('MMA_O_S', 'Objects - Supernova Remnants'), + ('MMA_S_F', 'Signals - Fast Radio Bursts'), + ('MMA_S_G', 'Signals - Gamma-ray Bursts'), + ('MMA_S_K', 'Signals - Kilonovae'), + ('MMA_S_N', 'Signals - Novae'), + ('MMA_S_P', 'Signals - 
Pevatrons'), + ('MMA_S_ST', 'Signals - Stellar flares'), + ('MMA_S_SU', 'Signals - Supernovae'), + ] + + tdamm_tag_manual = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM Manual Tags" + ) + tdamm_tag_ml = ArrayField( + models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), + blank=True, + null=True, + verbose_name="TDAMM ML Tags" + ) + tdamm_tag = PairedFieldDescriptor('tdamm_tag') + class Meta: """Meta definition for Candidate URL.""" diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 1d140889..31306b8c 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -26,7 +26,6 @@ UpdateFrequencies, WorkflowStatusChoices, ) -from ..utils.paired_field_descriptor import PairedFieldDescriptor User = get_user_model() @@ -34,10 +33,6 @@ class Collection(models.Model): """Model definition for Collection.""" - tdamm_tag_manual = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM Manual Tag") - tdamm_tag_ml = models.CharField(max_length=255, null=True, blank=True, verbose_name="TDAMM ML Tag") - tdamm_tag = PairedFieldDescriptor('tdamm_tag') - name = models.CharField("Name", max_length=1024) config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False) url = models.URLField("URL", max_length=2048) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 19717818..b7bb3b25 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -26,7 +26,6 @@ class Meta: "workflow_status_display", "curated_by", "division", - "tdamm_tag", "document_type", "name", ) @@ -123,6 +122,7 @@ class Meta: "hash", "file_extension", "tree_root", + "tdamm_tag" ) def get_document_type(self, obj): From 6bf48ff100d32cfe3e52605b13625f044210e79b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:19 -0600 Subject: [PATCH 022/330] adding admin views for DumpURL and URL models --- sde_collections/admin.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 4fce1ea7..a8fce352 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -7,7 +7,9 @@ from .models.collection import Collection, WorkflowHistory from .models.curated_url import CuratedUrl from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern +from .models.url import Url from .tasks import import_candidate_urls_from_api @@ -315,6 +317,20 @@ class DeltaUrlAdmin(admin.ModelAdmin): list_filter = ("collection",) +class DumpUrlAdmin(admin.ModelAdmin): + """Admin View for DumpUrl""" + + list_display = ("url", "scraped_title", "generated_title", "collection") + list_filter = ("collection",) + + +class UrlAdmin(admin.ModelAdmin): + """Admin View for Url""" + + list_display = ("url", "scraped_title", "collection") + list_filter = ("collection",) + + admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) @@ -323,3 +339,5 @@ class DeltaUrlAdmin(admin.ModelAdmin): admin.site.register(DivisionPattern, DivisionPatternAdmin) admin.site.register(DeltaUrl, DeltaUrlAdmin) admin.site.register(CuratedUrl, CuratedUrlAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(Url, UrlAdmin) From 
483685135cceffd131de25ffbf78c9d0bbdee929 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:46:34 -0600 Subject: [PATCH 023/330] migration for the dump URL file --- sde_collections/migrations/0061_dumpurl.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py new file mode 100644 index 00000000..4aeb0088 --- /dev/null +++ b/sde_collections/migrations/0061_dumpurl.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.9 on 2024-10-23 19:29 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0060_delete_dumpurl"), + ] + + operations = [ + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] From 19feff8cd273488bb727db9e9b81b9a0a112701b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:49:31 -0600 Subject: [PATCH 024/330] adding tasks to compare and add URLs to the new models --- sde_collections/tasks.py | 109 +++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 10 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index fa754efc..ecc3c1a9 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -6,11 +6,14 @@ from django.apps import apps from django.conf import settings from django.core import management -from django.core.management.commands import loaddata from config import celery_app from .models.collection import Collection, WorkflowStatusChoices +from .models.curated_url import CuratedUrl +from .models.delta_url import DeltaUrl +from .models.dump_url import DumpUrl +from .models.url import Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler @@ -49,7 +52,7 @@ def _get_data_to_import(collection, server_name): continue augmented_data = { - "model": "sde_collections.candidateurl", + "model": "sde_collections.url", "fields": { "collection": collection_pk, "url": url, @@ -62,6 +65,88 @@ def _get_data_to_import(collection, server_name): return data_to_import +def _compare_and_populate_delta_urls(collection): + """Compare DumpUrl and CuratedUrl and populate DeltaUrl.""" + dump_urls = DumpUrl.objects.filter(collection=collection) + curated_urls = CuratedUrl.objects.filter(collection=collection) + + DeltaUrl.objects.filter(collection=collection).delete() + + curated_urls_dict = {url.url: url for url in curated_urls} + + # Iterate over Dump URLs to find deltas + for dump_url in dump_urls: + curated_url = curated_urls_dict.get(dump_url.url) + + if not curated_url: + # New URL found, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + elif ( + curated_url.scraped_title != dump_url.scraped_title + or curated_url.generated_title != dump_url.generated_title + or curated_url.document_type != dump_url.document_type + or curated_url.division != 
dump_url.division + ): + # Metadata changed, add to DeltaUrl + DeltaUrl.objects.create( + collection=collection, + url=dump_url.url, + scraped_title=dump_url.scraped_title, + generated_title=dump_url.generated_title, + document_type=dump_url.document_type, + division=dump_url.division, + delete=False, + ) + + # Mark any missing URLs in CuratedUrl as deleted in DeltaUrl + dump_url_set = set(dump_urls.values_list("url", flat=True)) + for curated_url in curated_urls: + if curated_url.url not in dump_url_set: + DeltaUrl.objects.create( + collection=collection, + url=curated_url.url, + scraped_title=curated_url.scraped_title, + generated_title=curated_url.generated_title, + document_type=curated_url.document_type, + division=curated_url.division, + delete=True, + ) + + +def populate_dump_urls(collection): + urls = Url.objects.filter(collection=collection) + + for url_instance in urls: + try: + # Create DumpUrl by passing in the parent Url fields + dump_url_instance = DumpUrl( + id=url_instance.id, + collection=url_instance.collection, + url=url_instance.url, + scraped_title=url_instance.scraped_title, + visited=url_instance.visited, + document_type=url_instance.document_type, + division=url_instance.division, + ) + dump_url_instance.save() # Save both Url and DumpUrl entries + + print(f"Created DumpUrl: {dump_url_instance.url} - {dump_url_instance.scraped_title}") + + except Exception as e: + print(f"Error creating DumpUrl for {url_instance.url}: {str(e)}") + continue + + print(f"Successfully populated DumpUrl model with {urls.count()} entries.") + + @celery_app.task(soft_time_limit=10000) def import_candidate_urls_from_api(server_name="test", collection_ids=[]): TEMP_FOLDER_NAME = "temp" @@ -76,26 +161,30 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]): data_to_import = _get_data_to_import(server_name=server_name, collection=collection) print(f"Got {len(data_to_import)} records for {collection.config_folder}") + print("Clearing DumpUrl model...") + DumpUrl.objects.filter(collection=collection).delete() + print("Dumping django fixture to file") json.dump(data_to_import, open(urls_file, "w")) - print("Deleting existing candidate URLs") - # this sometimes takes a while - collection.candidate_urls.all().delete() + print("Loading data into Url model using loaddata...") + management.call_command("loaddata", urls_file) - print("Loading fixture; this may take a while") - # subprocess.call(f'python manage.py loaddata "{urls_file}"', shell=True) - management.call_command(loaddata.Command(), urls_file) + print("Creating DumpUrl entries...") + populate_dump_urls(collection) print("Applying existing patterns; this may take a while") collection.apply_all_patterns() - if collection.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING: + print("Comparing DumpUrl with CuratedUrl...") + _compare_and_populate_delta_urls(collection) + + if collection.workflow_status != WorkflowStatusChoices.ENGINEERING_IN_PROGRESS: collection.workflow_status = WorkflowStatusChoices.ENGINEERING_IN_PROGRESS collection.save() # Finally set the status to READY_FOR_CURATION - collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION + # collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION collection.save() print("Deleting temp files") From 7e24495fb2489615c0b8a6fd4b79d2e7550c436c Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 10:50:26 -0600 Subject: [PATCH 025/330] adding a save method for dump URL --- sde_collections/models/dump_url.py | 5 +++++ 
1 file changed, 5 insertions(+) diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py index 85ef85d9..82e168ca 100644 --- a/sde_collections/models/dump_url.py +++ b/sde_collections/models/dump_url.py @@ -7,3 +7,8 @@ class DumpUrl(Url): class Meta: verbose_name = "Dump URL" verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) From e5e64f46c26d822c971d741c774bed8dabf1121b Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:34:53 -0600 Subject: [PATCH 026/330] move all url models into the same file --- sde_collections/models/curated_url.py | 9 -------- sde_collections/models/delta_url.py | 13 ----------- sde_collections/models/dump_url.py | 14 ------------ sde_collections/models/url.py | 31 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 36 deletions(-) delete mode 100644 sde_collections/models/curated_url.py delete mode 100644 sde_collections/models/delta_url.py delete mode 100644 sde_collections/models/dump_url.py diff --git a/sde_collections/models/curated_url.py b/sde_collections/models/curated_url.py deleted file mode 100644 index d55dcb5f..00000000 --- a/sde_collections/models/curated_url.py +++ /dev/null @@ -1,9 +0,0 @@ -from .url import Url - - -class CuratedUrl(Url): - """Model for storing curated and live URLs after the curation process.""" - - class Meta: - verbose_name = "Curated URL" - verbose_name_plural = "Curated URLs" diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py deleted file mode 100644 index 028607ab..00000000 --- a/sde_collections/models/delta_url.py +++ /dev/null @@ -1,13 +0,0 @@ -from django.db import models - -from .url import Url - - -class DeltaUrl(Url): - """Model for storing delta URLs for curation purposes""" - - delete = models.BooleanField(default=False) - - class Meta: - verbose_name = "Delta URL" - verbose_name_plural = "Delta URLs" diff --git a/sde_collections/models/dump_url.py b/sde_collections/models/dump_url.py deleted file mode 100644 index 82e168ca..00000000 --- a/sde_collections/models/dump_url.py +++ /dev/null @@ -1,14 +0,0 @@ -from .url import Url - - -class DumpUrl(Url): - """Model for storing all the imported URLs before seperating them into delta URLs and Curated URLs.""" - - class Meta: - verbose_name = "Dump URL" - verbose_name_plural = "Dump URLs" - - def save(self, *args, **kwargs): - if not self.pk: # Ensure it's only called on create - super().save(*args, **kwargs) # Save the parent `Url` entry - super().save(*args, **kwargs) diff --git a/sde_collections/models/url.py b/sde_collections/models/url.py index 7ce86dff..3fc70243 100644 --- a/sde_collections/models/url.py +++ b/sde_collections/models/url.py @@ -83,3 +83,34 @@ def __str__(self) -> str: def save(self, *args, **kwargs): super().save(*args, **kwargs) + + +class DumpUrl(Url): + """Model for storing all the imported URLs before separating them into delta URLs and Curated URLs.""" + + class Meta: + verbose_name = "Dump URL" + verbose_name_plural = "Dump URLs" + + def save(self, *args, **kwargs): + if not self.pk: # Ensure it's only called on create + super().save(*args, **kwargs) # Save the parent `Url` entry + super().save(*args, **kwargs) + + +class DeltaUrl(Url): + """Model for storing delta URLs for curation purposes""" + + delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta URL" + 
verbose_name_plural = "Delta URLs" + + +class CuratedUrl(Url): + """Model for storing curated and live URLs after the curation process.""" + + class Meta: + verbose_name = "Curated URL" + verbose_name_plural = "Curated URLs" From 7a906b71d5355fc13cacafd1f985ee692e9474ef Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Mon, 4 Nov 2024 11:42:44 -0600 Subject: [PATCH 027/330] update admin url imports --- sde_collections/admin.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index a8fce352..df33af9d 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,11 +5,8 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url +from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl from .tasks import import_candidate_urls_from_api From 728a5b425b76d402ffefb83aef5f574fa7b84c2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:42:59 +0000 Subject: [PATCH 028/330] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index df33af9d..e7780846 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -6,7 +6,7 @@ from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .models.url import Url, CuratedUrl, DeltaUrl, DumpUrl +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .tasks import import_candidate_urls_from_api From f5c69bd4ce64c1edcfdd700e15e0e0404b19ce67 Mon Sep 17 00:00:00 2001 From: Kiran Dawadi Date: Mon, 4 Nov 2024 14:46:19 -0600 Subject: [PATCH 029/330] refactor code --- sde_collections/admin.py | 171 ++++++++++++------ ..._candidateurl_tdamm_tag_manual_and_more.py | 151 ++++++++++++++++ sde_collections/models/candidate_url.py | 120 +++++++----- sde_collections/serializers.py | 21 ++- .../utils/paired_field_descriptor.py | 3 - 5 files changed, 349 insertions(+), 117 deletions(-) create mode 100644 sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 73576899..0860d0e5 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -8,7 +8,6 @@ from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api from django import forms -from django.contrib.postgres.fields import ArrayField @admin.action(description="Generate deployment message") @@ -264,14 +263,8 @@ def exclude_and_delete_children(modeladmin, request, queryset): for candidate_url in queryset.all(): candidate_url.get_children().delete() -class CandidateURLForm(forms.ModelForm): - # tdamm_tag = forms.MultipleChoiceField( - # choices=CandidateURL.TDAMM_TAG_CHOICES, - # required=False, - # label="TDAMM Tags", - # widget=forms.CheckboxSelectMultiple, - # ) +class CandidateURLForm(forms.ModelForm): tdamm_tag_ml = forms.MultipleChoiceField( 
choices=CandidateURL.TDAMM_TAG_CHOICES, required=False, @@ -285,83 +278,141 @@ class CandidateURLForm(forms.ModelForm): label="TDAMM Manual Tags", widget=forms.CheckboxSelectMultiple, ) - + class Meta: model = CandidateURL - fields = '__all__' + fields = "__all__" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Initialize tdamm_tag - # if self.instance and hasattr(self.instance, 'tdamm_tag'): - # self.fields['tdamm_tag'].initial = self.instance.tdamm_tag or [] - - # Initialize tdamm_tag_ml - if self.instance and self.instance.tdamm_tag_ml: - self.fields['tdamm_tag_ml'].initial = self.instance.tdamm_tag_ml - - # Initialize tdamm_tag_manual - if self.instance and self.instance.tdamm_tag_manual: - self.fields['tdamm_tag_manual'].initial = self.instance.tdamm_tag_manual + instance = kwargs.get("instance") + + # Only show TDAMM fields if is_tdamm is True + if not instance or not instance.is_tdamm: + if "tdamm_tag_ml" in self.fields: + del self.fields["tdamm_tag_ml"] + if "tdamm_tag_manual" in self.fields: + del self.fields["tdamm_tag_manual"] + else: + # Initialize tdamm fields only if is_tdamm is True + if hasattr(self.instance, "tdamm_tag_ml"): + self.fields["tdamm_tag_ml"].initial = self.instance.tdamm_tag_ml or [] + + if hasattr(self.instance, "tdamm_tag_manual"): + self.fields["tdamm_tag_manual"].initial = self.instance.tdamm_tag_manual or [] def clean(self): cleaned_data = super().clean() - - # Handle tdamm_tag - # tdamm_tag_value = cleaned_data.get('tdamm_tag', []) - # if not tdamm_tag_value: - # cleaned_data['tdamm_tag_manual'] = None - # else: - # cleaned_data['tdamm_tag_manual'] = tdamm_tag_value - - # Handle tdamm_tag_ml - tdamm_tag_ml_value = cleaned_data.get('tdamm_tag_ml', []) - if not tdamm_tag_ml_value: - cleaned_data['tdamm_tag_ml'] = None - - # Handle tdamm_tag_manual - tdamm_tag_manual_value = cleaned_data.get('tdamm_tag_manual', []) - if not tdamm_tag_manual_value: - cleaned_data['tdamm_tag_manual'] = None - return cleaned_data + def save(self, commit=True): + instance = super().save(commit=False) + + # Handle TDAMM fields if is_tdamm is True + if instance.is_tdamm: + # Get values from the form + tdamm_tag_ml = self.cleaned_data.get("tdamm_tag_ml", []) + tdamm_tag_manual = self.cleaned_data.get("tdamm_tag_manual", []) + + # Set the values directly on the instance + instance.tdamm_tag_ml = tdamm_tag_ml or None + instance.tdamm_tag_manual = tdamm_tag_manual or None + else: + # Clear TDAMM fields if is_tdamm is False + instance.tdamm_tag_ml = None + instance.tdamm_tag_manual = None + + if commit: + instance.save() + + return instance + + class CandidateURLAdmin(admin.ModelAdmin): """Admin View for CandidateURL""" form = CandidateURLForm - list_display = ( - "url", - "scraped_title", - "collection", - # "tdamm_tag_display", - "tdamm_tag_ml_display", - "tdamm_tag_manual_display" - ) - list_filter = ("collection",) - - # @admin.display(description='TDAMM Tags') - # def tdamm_tag_display(self, obj): - # if obj.tdamm_tag: - # readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag] - # return ", ".join(readable_tags) - # return "" - - @admin.display(description='TDAMM ML Tags') + def get_list_display(self, request): + list_display = [ + "url", + "scraped_title", + "collection", + "is_tdamm", + ] + # Add TDAMM-related fields only if any TDAMM-enabled URLs exist + if CandidateURL.objects.filter(is_tdamm=True).exists(): + list_display.extend(["tdamm_tag_ml_display", "tdamm_tag_manual_display"]) + return list_display + + 
list_filter = ("collection", "is_tdamm") + + @admin.display(description="TDAMM ML Tags") def tdamm_tag_ml_display(self, obj): - if obj.tdamm_tag_ml: + if obj.is_tdamm and obj.tdamm_tag_ml: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_ml] return ", ".join(readable_tags) return "" - @admin.display(description='TDAMM Manual Tags') + @admin.display(description="TDAMM Manual Tags") def tdamm_tag_manual_display(self, obj): - if obj.tdamm_tag_manual: + if obj.is_tdamm and obj.tdamm_tag_manual: readable_tags = [dict(CandidateURL.TDAMM_TAG_CHOICES).get(tag, tag) for tag in obj.tdamm_tag_manual] return ", ".join(readable_tags) return "" + def get_fieldsets(self, request, obj=None): + """Dynamically adjust fieldsets based on is_tdamm""" + fieldsets = [ + ( + "Essential Information", + { + "fields": ( + "collection", + "url", + "hash", + "scraped_title", + "generated_title", + "test_title", + "production_title", + "level", + "visited", + "document_type", + "division", + "inferenced_by", + "is_pdf", + "present_on_test", + "present_on_prod", + "is_tdamm", + ) + }, + ), + ] + + # Add TDAMM fields only if is_tdamm is True + if obj and obj.is_tdamm: + fieldsets.append( + ( + "TDAMM Tags", + { + "fields": ( + "tdamm_tag_ml", + "tdamm_tag_manual", + ), + "classes": ("collapse",), + }, + ) + ) + + return fieldsets + + def save_model(self, request, obj, form, change): + """Ensure proper saving of the model""" + if not obj.is_tdamm: + obj.tdamm_tag_ml = None + obj.tdamm_tag_manual = None + super().save_model(request, obj, form, change) + class TitlePatternAdmin(admin.ModelAdmin): """Admin View for TitlePattern""" diff --git a/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..d8a0a4a7 --- /dev/null +++ b/sde_collections/migrations/0060_alter_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,151 @@ +# Generated by Django 4.2.9 on 2024-11-04 06:33 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + 
("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + verbose_name="TDAMM Manual Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_manual", + new_name="_tdamm_tag_manual", + ), + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", 
"Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + verbose_name="TDAMM ML Tags", + ), + ), + migrations.RenameField( + model_name="candidateurl", + old_name="tdamm_tag_ml", + new_name="_tdamm_tag_ml", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_manual", + ), + migrations.RemoveField( + model_name="collection", + name="tdamm_tag_ml", + ), + migrations.AddField( + model_name="candidateurl", + name="is_tdamm", + field=models.BooleanField( + default=False, help_text="Enable TDAMM tagging for this URL", verbose_name="Is TDAMM" + ), + ), + ] diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index f8c91a97..41c1072f 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -10,6 +10,7 @@ from ..utils.paired_field_descriptor import PairedFieldDescriptor from django.contrib.postgres.fields import ArrayField + class CandidateURLQuerySet(models.QuerySet): def with_exclusion_status(self): return self.annotate( @@ -80,63 +81,91 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) + is_tdamm = models.BooleanField("Is TDAMM?", default=False, help_text="Enable TDAMM tagging for this URL") TDAMM_TAG_CHOICES = [ - ('MMA_M_EM', 'Messenger - EM Radiation'), - ('MMA_M_EM_G', 'Messenger - EM Radiation - Gamma rays'), - ('MMA_M_EM_X', 'Messenger - EM Radiation - X-rays'), - ('MMA_M_EM_U', 'Messenger - EM Radiation - Ultraviolet'), - ('MMA_M_EM_O', 'Messenger - EM Radiation - Optical'), - ('MMA_M_EM_I', 'Messenger - EM Radiation - Infrared'), - ('MMA_M_EM_M', 'Messenger - EM Radiation - Microwave'), - ('MMA_M_EM_R', 'Messenger - EM Radiation - Radio'), - ('MMA_M_G', 'Messenger - Gravitational Waves'), - ('MMA_M_G_CBI', 'Messenger - Gravitational Waves - Compact Binary Inspiral'), - ('MMA_M_G_S', 'Messenger - Gravitational Waves - Stochastic'), - ('MMA_M_G_CON', 'Messenger - Gravitational Waves - Continuous'), - ('MMA_M_G_B', 'Messenger - Gravitational Waves - Burst'), - ('MMA_M_C', 'Messenger - Cosmic Rays'), - ('MMA_M_N', 'Messenger - Neutrinos'), - ('MMA_O_BI', 'Objects - Binaries'), - ('MMA_O_BI_BBH', 'Objects - Binaries - Binary Black Holes'), - ('MMA_O_BI_BNS', 'Objects - Binaries - Binary Neutron Stars'), - ('MMA_O_BI_C', 'Objects - Binaries - Cataclysmic Variables'), - ('MMA_O_BI_N', 'Objects - Binaries - Neutron Star-Black Hole'), - ('MMA_O_BI_B', 'Objects - Binaries - Binary Pulsars'), - ('MMA_O_BI_W', 'Objects - Binaries - White Dwarf Binaries'), - ('MMA_O_BH', 'Objects - Black Holes'), - ('MMA_O_BH_AGN', 'Objects - Black Holes - Active Galactic Nuclei'), - ('MMA_O_BH_IM', 'Objects - Black Holes - Intermediate mass'), - ('MMA_O_BH_STM', 'Objects - Black Holes - Stellar mass'), - ('MMA_O_BH_SUM', 'Objects - Black Holes - Supermassive'), - ('MMA_O_E', 'Objects - Exoplanets'), - ('MMA_O_N', 'Objects - Neutron Stars'), - ('MMA_O_N_M', 'Objects - Neutron Stars - Magnetars'), - ('MMA_O_N_P', 'Objects - Neutron Stars - Pulsars'), - ('MMA_O_N_PWN', 'Objects - Neutron Stars - Pulsar Wind Nebula'), - ('MMA_O_S', 'Objects - Supernova Remnants'), - ('MMA_S_F', 'Signals - Fast Radio Bursts'), - ('MMA_S_G', 'Signals - Gamma-ray Bursts'), - ('MMA_S_K', 'Signals - Kilonovae'), - ('MMA_S_N', 'Signals - Novae'), - ('MMA_S_P', 'Signals - Pevatrons'), - ('MMA_S_ST', 
'Signals - Stellar flares'), - ('MMA_S_SU', 'Signals - Supernovae'), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), ] - tdamm_tag_manual = ArrayField( + # Define TDAMM fields but make them optional + @property + def tdamm_tag_manual(self): + if hasattr(self, "_tdamm_tag_manual") and self.is_tdamm: + return self._tdamm_tag_manual + return None + + @tdamm_tag_manual.setter + def tdamm_tag_manual(self, value): + if self.is_tdamm: + self._tdamm_tag_manual = value + + @property + def tdamm_tag_ml(self): + if hasattr(self, "_tdamm_tag_ml") and self.is_tdamm: + return self._tdamm_tag_ml + return None + + @tdamm_tag_ml.setter + def tdamm_tag_ml(self, value): + if self.is_tdamm: + self._tdamm_tag_ml = value + + _tdamm_tag_manual = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM Manual Tags" + verbose_name="TDAMM Manual Tags", + db_column="tdamm_tag_manual", ) - tdamm_tag_ml = ArrayField( + + _tdamm_tag_ml = ArrayField( models.CharField(max_length=255, choices=TDAMM_TAG_CHOICES), blank=True, null=True, - verbose_name="TDAMM ML Tags" + verbose_name="TDAMM ML Tags", + db_column="tdamm_tag_ml", ) - tdamm_tag = PairedFieldDescriptor('tdamm_tag') + + tdamm_tag = PairedFieldDescriptor("tdamm_tag") class Meta: """Meta definition for Candidate URL.""" @@ -144,6 +173,7 @@ class Meta: verbose_name = "Candidate URL" verbose_name_plural = "Candidate URLs" ordering = ["url"] + 
db_table = "sde_collections_candidateurl" @property def fileext(self) -> str: diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index b7bb3b25..29d86c31 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -112,18 +112,21 @@ class CandidateURLAPISerializer(serializers.ModelSerializer): title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() + tdamm_tag = serializers.SerializerMethodField() class Meta: model = CandidateURL - fields = ( - "url", - "title", - "document_type", - "hash", - "file_extension", - "tree_root", - "tdamm_tag" - ) + fields = ("url", "title", "document_type", "hash", "file_extension", "tree_root", "is_tdamm", "tdamm_tag") + + def to_representation(self, instance): + """Remove tdamm_tag field if is_tdamm is False""" + representation = super().to_representation(instance) + if not instance.is_tdamm: + representation.pop("tdamm_tag", None) + return representation + + def get_tdamm_tag(self, obj): + return obj.tdamm_tag def get_document_type(self, obj): if obj.document_type is not None: diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py index e07d41dc..9ac0c4e3 100644 --- a/sde_collections/utils/paired_field_descriptor.py +++ b/sde_collections/utils/paired_field_descriptor.py @@ -1,6 +1,3 @@ -from django.db import models - - class PairedFieldDescriptor: def __init__(self, field_name): self.manual_field_name = f"{field_name}_manual" From 7e888e8457f02bbe8417f6e66ecc1d52be9608c4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 20:49:23 +0000 Subject: [PATCH 030/330] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sde_collections/admin.py | 2 +- sde_collections/models/candidate_url.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 0860d0e5..bf97cf02 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -1,5 +1,6 @@ import csv +from django import forms from django.contrib import admin, messages from django.http import HttpResponse @@ -7,7 +8,6 @@ from .models.collection import Collection, WorkflowHistory from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import import_candidate_urls_from_api -from django import forms @admin.action(description="Generate deployment message") diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 41c1072f..8d2776dd 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -2,13 +2,13 @@ import os from urllib.parse import urlparse +from django.contrib.postgres.fields import ArrayField from django.db import models +from ..utils.paired_field_descriptor import PairedFieldDescriptor from .collection import Collection from .collection_choice_fields import Divisions, DocumentTypes from .pattern import ExcludePattern, TitlePattern -from ..utils.paired_field_descriptor import PairedFieldDescriptor -from django.contrib.postgres.fields import ArrayField class CandidateURLQuerySet(models.QuerySet): From df88c6b11c1a91709bfcd01a1a88f8887f8b814b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:23:41 -0600 Subject: [PATCH 031/330] squashed migrations --- 
.../0059_url_curatedurl_deltaurl_dumpurl.py | 2 +- .../migrations/0060_delete_dumpurl.py | 16 --------- sde_collections/migrations/0061_dumpurl.py | 35 ------------------- 3 files changed, 1 insertion(+), 52 deletions(-) delete mode 100644 sde_collections/migrations/0060_delete_dumpurl.py delete mode 100644 sde_collections/migrations/0061_dumpurl.py diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py index 82f4d4af..58478546 100644 --- a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py +++ b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.9 on 2024-10-10 03:01 +# Generated by Django 4.2.9 on 2024-11-04 22:22 from django.db import migrations, models import django.db.models.deletion diff --git a/sde_collections/migrations/0060_delete_dumpurl.py b/sde_collections/migrations/0060_delete_dumpurl.py deleted file mode 100644 index db9a10c1..00000000 --- a/sde_collections/migrations/0060_delete_dumpurl.py +++ /dev/null @@ -1,16 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-14 16:37 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"), - ] - - operations = [ - migrations.DeleteModel( - name="DumpUrl", - ), - ] diff --git a/sde_collections/migrations/0061_dumpurl.py b/sde_collections/migrations/0061_dumpurl.py deleted file mode 100644 index 4aeb0088..00000000 --- a/sde_collections/migrations/0061_dumpurl.py +++ /dev/null @@ -1,35 +0,0 @@ -# Generated by Django 4.2.9 on 2024-10-23 19:29 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ("sde_collections", "0060_delete_dumpurl"), - ] - - operations = [ - migrations.CreateModel( - name="DumpUrl", - fields=[ - ( - "url_ptr", - models.OneToOneField( - auto_created=True, - on_delete=django.db.models.deletion.CASCADE, - parent_link=True, - primary_key=True, - serialize=False, - to="sde_collections.url", - ), - ), - ], - options={ - "verbose_name": "Dump URL", - "verbose_name_plural": "Dump URLs", - }, - bases=("sde_collections.url",), - ), - ] From 48592cb6af69176b54fca27944dc0da370178aa1 Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:01 -0600 Subject: [PATCH 032/330] updated import references --- sde_collections/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 2f11700b..c42a84e6 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -3,7 +3,6 @@ from .models.candidate_url import CandidateURL from .models.collection import Collection, WorkflowHistory from .models.collection_choice_fields import Divisions, DocumentTypes -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -11,6 +10,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl class CollectionSerializer(serializers.ModelSerializer): From 266082c6f6054af9b0a72ed9cdf1a227012a080b Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:20 -0600 Subject: [PATCH 033/330] updated import references --- sde_collections/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index ecc3c1a9..77876500 
100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -10,10 +10,7 @@ from config import celery_app from .models.collection import Collection, WorkflowStatusChoices -from .models.curated_url import CuratedUrl -from .models.delta_url import DeltaUrl -from .models.dump_url import DumpUrl -from .models.url import Url +from .models.url import CuratedUrl, DeltaUrl, DumpUrl, Url from .sinequa_api import Api from .utils.github_helper import GitHubHandler From c3e2aee2be337ab04387e860c6cef24fcc8266ac Mon Sep 17 00:00:00 2001 From: Bishwas Praveen Date: Mon, 4 Nov 2024 16:24:39 -0600 Subject: [PATCH 034/330] update import references --- sde_collections/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sde_collections/views.py b/sde_collections/views.py index b8ff70a0..5d5d2982 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -27,7 +27,6 @@ DocumentTypes, WorkflowStatusChoices, ) -from .models.curated_url import CuratedUrl from .models.pattern import ( DivisionPattern, DocumentTypePattern, @@ -35,6 +34,7 @@ IncludePattern, TitlePattern, ) +from .models.url import CuratedUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, From f95a1a2666c3ab3d34c3331ccc883e83aa8c6006 Mon Sep 17 00:00:00 2001 From: Dhanur Sharma Date: Wed, 6 Nov 2024 15:55:28 -0600 Subject: [PATCH 035/330] Frontend work in progress --- sde_collections/serializers.py | 90 +- sde_collections/urls.py | 2 + sde_collections/views.py | 64 +- .../static/js/candidate_url_list.js | 848 +++++++++++++---- .../sde_collections/candidate_urls_list.html | 893 +++++++++++------- 5 files changed, 1378 insertions(+), 519 deletions(-) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index c42a84e6..ff1b6d3d 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -10,7 +10,7 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl class CollectionSerializer(serializers.ModelSerializer): @@ -99,6 +99,94 @@ class Meta: ) +class CuratedURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + curated_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_curated_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.curated_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = CuratedUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "curated_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + +class 
DeltaURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + delta_urls_count = serializers.SerializerMethodField(read_only=True) + + def get_delta_urls_count(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.delta_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.titlepattern_urls.last() + return titlepattern.match_pattern_type if titlepattern else None + + class Meta: + model = DeltaUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "delta_urls_count", + "document_type", + "document_type_display", + "division", + "division_display", + "visited", + # "test_title", + # "production_title", + # "present_on_test", + # "present_on_prod", + ) + + class CandidateURLBulkCreateSerializer(serializers.ModelSerializer): class Meta: model = CandidateURL diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 214d1198..a17f6390 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -9,6 +9,8 @@ router.register(r"collections", views.CollectionViewSet, basename="collection") router.register(r"collections-read", views.CollectionReadViewSet, basename="collection-read") router.register(r"candidate-urls", views.CandidateURLViewSet) +router.register(r"curated-urls", views.CuratedURLViewSet) +router.register(r"delta-urls", views.DeltaURLViewSet) router.register(r"exclude-patterns", views.ExcludePatternViewSet) router.register(r"include-patterns", views.IncludePatternViewSet) router.register(r"title-patterns", views.TitlePatternViewSet) diff --git a/sde_collections/views.py b/sde_collections/views.py index 5d5d2982..f738b23d 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -34,13 +34,15 @@ IncludePattern, TitlePattern, ) -from .models.url import CuratedUrl +from .models.url import CuratedUrl, DeltaUrl from .serializers import ( CandidateURLBulkCreateSerializer, CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, CuratedUrlAPISerializer, + CuratedURLSerializer, + DeltaURLSerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -285,6 +287,66 @@ def update_division(self, request, pk=None): return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) +class CuratedURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = CuratedUrl.objects.all() + serializer_class = CuratedURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = 
self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + curated_url = get_object_or_404(CuratedUrl, pk=pk) + division = request.data.get("division") + if division: + curated_url.division = division + curated_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + +class DeltaURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = DeltaUrl.objects.all() + serializer_class = DeltaURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + delta_url = get_object_or_404(DeltaUrl, pk=pk) + division = request.data.get("division") + if division: + delta_url.division = division + delta_url.save() + return Response(status=status.HTTP_200_OK) + return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + class CandidateURLBulkCreateView(generics.ListCreateAPIView): queryset = CandidateURL.objects.all() serializer_class = CandidateURLBulkCreateSerializer diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/candidate_url_list.js index ed6d3e4b..7b01cc6c 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/candidate_url_list.js @@ -322,6 +322,436 @@ function initializeDataTable() { }, 1000) ); + var curated_urls_table = $("#curated_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#curatedUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#curatedScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#curatedNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter 
information to the first row + const secondRowFilters = [ + "Export of SDE Curated URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#curated_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_curated_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/curated-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getCuratedURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getCuratedGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "curated_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? 
[getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#curatedUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#curatedScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#curatedNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + + var delta_urls_table = $("#delta_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#deltaUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#deltaScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#deltaNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter information to the first row + const secondRowFilters = [ + "Export of SDE Delta URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#delta_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_delta_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/delta-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? 
false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getDeltaURLColumn(), + getExcludedColumn(true_icon, false_icon), + getScrapedTitleColumn(), + getDeltaGeneratedTitleColumn(), + getDocumentTypeColumn(), + getDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "delta_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? [getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#deltaUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#deltaScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#deltaNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + delta_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + var exclude_patterns_table = $("#exclude_patterns_table").DataTable({ // scrollY: true, dom: "lBrtip", @@ -682,96 +1112,96 @@ function initializeDataTable() { var division_patterns_table = $("#division_patterns_table").DataTable({ dom: "lBrtip", buttons: [ - { - text: "Add Pattern", - className: "addPattern", - action: function () { - $modal = $("#divisionPatternModal").modal(); - }, + { + text: "Add Pattern", + className: "addPattern", + action: function () { + $modal = $("#divisionPatternModal").modal(); }, - { - text: "Customize Columns", - className: "customizeColumns", - action: function () { - modalContents("#division_patterns_table"); - }, + }, + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#division_patterns_table"); }, + }, ], lengthMenu: [ - [25, 50, 100, 500], - ["Show 25", "Show 50", "Show 100", "Show 500"], + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], ], orderCellsTop: true, pageLength: 100, ajax: `/api/division-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { - this.api() - .columns() - .every(function (index) { - var table = $("#division_patterns_table").DataTable(); - - let 
addDropdownSelect = { - 1: { - columnToSearch: 6, - matchPattern: { - "Individual URL Pattern": 1, - "Multi-URL Pattern": 2, - }, - }, - 2: { - columnToSearch: 7, - matchPattern: { - "Astrophysics": 1, - "Biological and Physical Sciences": 2, - "Earth Science": 3, - "Heliophysics": 4, - "Planetary Science": 5, - }, - }, - }; - - let column = this; - if (column.data().length === 0) { - $(`#division-patterns-dropdown-${index}`).prop("disabled", true); - } else if (index in addDropdownSelect) { - $("#division-patterns-dropdown-" + index).on("change", function () { - let col = addDropdownSelect[index].columnToSearch; - let searchInput = - addDropdownSelect[index].matchPattern[$(this).val()]; - if ($(this).val() === "" || $(this).val() === undefined) - table.columns(col).search("").draw(); - else { - table.columns(col).search(searchInput).draw(); - } - }); - } + this.api() + .columns() + .every(function (index) { + var table = $("#division_patterns_table").DataTable(); + + let addDropdownSelect = { + 1: { + columnToSearch: 6, + matchPattern: { + "Individual URL Pattern": 1, + "Multi-URL Pattern": 2, + }, + }, + 2: { + columnToSearch: 7, + matchPattern: { + "Astrophysics": 1, + "Biological and Physical Sciences": 2, + "Earth Science": 3, + "Heliophysics": 4, + "Planetary Science": 5, + }, + }, + }; + + let column = this; + if (column.data().length === 0) { + $(`#division-patterns-dropdown-${index}`).prop("disabled", true); + } else if (index in addDropdownSelect) { + $("#division-patterns-dropdown-" + index).on("change", function () { + let col = addDropdownSelect[index].columnToSearch; + let searchInput = + addDropdownSelect[index].matchPattern[$(this).val()]; + if ($(this).val() === "" || $(this).val() === undefined) + table.columns(col).search("").draw(); + else { + table.columns(col).search(searchInput).draw(); + } }); + } + }); }, columns: [ - { data: "match_pattern", class: "whiteText" }, - { - data: "match_pattern_type_display", - class: "text-center whiteText", - sortable: false, - }, - { data: "division_display", class: "whiteText" }, - { - data: "candidate_urls_count", - class: "text-center whiteText", - sortable: true, - }, - { - data: null, - sortable: false, - class: "text-center", - render: function (data, type, row) { - return ``; - }, + { data: "match_pattern", class: "whiteText" }, + { + data: "match_pattern_type_display", + class: "text-center whiteText", + sortable: false, + }, + { data: "division_display", class: "whiteText" }, + { + data: "candidate_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: null, + sortable: false, + class: "text-center", + render: function (data, type, row) { + return ``; }, - { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, - { data: "division", visible: false }, + }, + { data: "id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false }, + { data: "division", visible: false }, ], }); @@ -841,8 +1271,8 @@ function getDivisionColumn() { `; }, @@ -882,7 +1312,7 @@ $("#division_pattern_form").on("submit", function (e) { inputs = {}; input_serialized = $(this).serializeArray(); input_serialized.forEach((field) => { - inputs[field.name] = field.value; + inputs[field.name] = field.value; }); console.log("Form Inputs:", inputs); // Debugging line to check inputs @@ -902,43 +1332,43 @@ $(".division_form_select").on("click", function (e) { function postDivisionPatterns(match_pattern, match_pattern_type, division) { if (!match_pattern) { - 
toastr.error("Please highlight a pattern to add division."); - return; + toastr.error("Please highlight a pattern to add division."); + return; } $.ajax({ - url: "/api/division-patterns/", - type: "POST", - data: { - collection: collection_id, - match_pattern: match_pattern, - match_pattern_type: match_pattern_type, - division: division, - csrfmiddlewaretoken: csrftoken, - }, - success: function (data) { - $("#candidate_urls_table").DataTable().ajax.reload(null, false); - $("#division_patterns_table").DataTable().ajax.reload(null, false); - if (currentTab === "") { // Only add a notification if we are on the first tab - newDivisionPatternsCount = newDivisionPatternsCount + 1; - $("#divisionPatternsTab").html( - `Division Patterns ` + - newDivisionPatternsCount + " new" + - `` - ); - } - }, - error: function (xhr, status, error) { - var errorMessage = xhr.responseText; - if ( - errorMessage == - '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}' - ) { - toastr.success("Pattern already exists"); - return; - } - toastr.error(errorMessage); - }, + url: "/api/division-patterns/", + type: "POST", + data: { + collection: collection_id, + match_pattern: match_pattern, + match_pattern_type: match_pattern_type, + division: division, + csrfmiddlewaretoken: csrftoken, + }, + success: function (data) { + $("#candidate_urls_table").DataTable().ajax.reload(null, false); + $("#division_patterns_table").DataTable().ajax.reload(null, false); + if (currentTab === "") { // Only add a notification if we are on the first tab + newDivisionPatternsCount = newDivisionPatternsCount + 1; + $("#divisionPatternsTab").html( + `Division Patterns ` + + newDivisionPatternsCount + " new" + + `` + ); + } + }, + error: function (xhr, status, error) { + var errorMessage = xhr.responseText; + if ( + errorMessage == + '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}' + ) { + toastr.success("Pattern already exists"); + return; + } + toastr.error(errorMessage); + }, }); } @@ -950,9 +1380,36 @@ function getURLColumn() { return `
${remove_protocol(
        data
      )}
-        open_in_new
      `;
+        open_in_new`;
    },
  };
}
+function getCuratedURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `${remove_protocol(
        data
      )}
+        open_in_new
      `;
    },
  };
}
+
+function getDeltaURLColumn() {
+  return {
+    data: "url",
+    width: "30%",
+    render: function (data, type, row) {
+      return `${remove_protocol(
        data
      )}
+        open_in_new
      `;
    },
  };
}
@@ -323,15 +523,18 @@
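Both the curated and delta tables introduced in this patch fetch their rows from the /api/curated-urls/ and /api/delta-urls/ endpoints registered earlier in this series, using the same datatables-style responses the candidate URL table already relies on. The sketch below is one minimal way to exercise those endpoints outside the browser; the base URL, collection id, and authentication step are assumptions for illustration only, not values taken from this change.

# Hypothetical smoke check for the new endpoints -- illustrative only, not part
# of this patch. The query parameters mirror the ajax calls wired up in
# candidate_url_list.js: format=datatables, collection_id, and is_excluded.
import requests

BASE_URL = "http://localhost:8000"   # assumption: local dev server
COLLECTION_ID = 1                    # assumption: an existing collection id

with requests.Session() as session:
    # Any required authentication (session cookie, token header) would be
    # configured on the session here.
    for endpoint in ("curated-urls", "delta-urls"):
        response = session.get(
            f"{BASE_URL}/api/{endpoint}/",
            params={
                "format": "datatables",
                "collection_id": COLLECTION_ID,
                "is_excluded": "false",  # same flag the filter checkbox sends
            },
        )
        response.raise_for_status()
        payload = response.json()
        # datatables-formatted responses return the row objects under "data".
        print(endpoint, payload.get("recordsTotal"), len(payload.get("data", [])))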
@@ -349,15 +552,18 @@
@@ -375,19 +581,23 @@
@@ -405,7 +615,8 @@
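The generated_title_id, match_pattern_type, and count fields on the new CuratedURLSerializer and DeltaURLSerializer all derive from the most recently attached TitlePattern, falling back to None or 0 when a URL has no matching pattern. A minimal standalone sketch of that lookup is shown below; the helper name, its related_name parameter, and the tuple return shape are assumptions made for illustration, not code from this series.

# Illustrative helper only -- it mirrors the fallback logic in the serializers'
# get_* methods rather than adding new behaviour.
def title_pattern_summary(url_obj, related_name):
    """Return (pattern_id, match_pattern_type, matched_url_count) for a Url row.

    related_name is "curated_urls" for CuratedUrl rows and "delta_urls" for
    DeltaUrl rows, matching the count fields on the two serializers.
    """
    pattern = url_obj.titlepattern_urls.last()
    if pattern is None:
        return (None, None, 0)
    return (pattern.id, pattern.match_pattern_type, getattr(pattern, related_name).count())

Keeping the lookup in one place like this would also make it easier to unit-test the no-pattern path that both serializers currently repeat.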