diff --git a/.dockerignore b/.dockerignore index 7369480e..ffcc7fd6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,5 +7,13 @@ .pre-commit-config.yaml .readthedocs.yml .travis.yml -venv .git + +# ignore local python environments +venv +.venv + +# prevent large backup files from being copied into the image +/backups +*.sql +*.gz diff --git a/.envs/.local/.django b/.envs/.local/.django index 97dfaab8..0978166d 100644 --- a/.envs/.local/.django +++ b/.envs/.local/.django @@ -33,7 +33,17 @@ SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch' # Slack Webhook # ------------------------------------------------------------------------------ SLACK_WEBHOOK_URL='' -LRM_USER='' -LRM_PASSWORD='' + +#Server Credentials +#-------------------------------------------------------------------------------- +LRM_DEV_USER='' +LRM_DEV_PASSWORD='' XLI_USER='' XLI_PASSWORD='' +LRM_QA_USER='' +LRM_QA_PASSWORD='' + +#Server Tokens +#-------------------------------------------------------------------------------- +LRM_DEV_TOKEN='' +XLI_TOKEN='' diff --git a/.github/workflows/run_full_test_suite.yml b/.github/workflows/run_full_test_suite.yml new file mode 100644 index 00000000..10c61336 --- /dev/null +++ b/.github/workflows/run_full_test_suite.yml @@ -0,0 +1,44 @@ +name: Django Test Suite on PR + +on: + pull_request: + branches: + - dev + paths-ignore: + - '**/*.md' + +jobs: + run-tests: + runs-on: ubuntu-latest + + services: + docker: + image: docker:24.0.5 + options: --privileged + ports: + - 5432:5432 + + steps: + - name: Check out merged code + uses: actions/checkout@v2 + + - name: Set up Docker Compose + run: | + sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + + - name: Build the Docker environment + run: docker-compose -f local.yml build + + - name: Run test suite + env: + DJANGO_ENV: test + run: docker-compose -f local.yml run --rm django bash ./init.sh + + - name: Generate Coverage Report + env: + DJANGO_ENV: test + run: docker-compose -f local.yml run --rm django bash -c "coverage report" + + - name: Cleanup + run: docker-compose -f local.yml down --volumes diff --git a/.gitignore b/.gitignore index 12fec5ec..24e03b4a 100644 --- a/.gitignore +++ b/.gitignore @@ -292,8 +292,7 @@ config_generation/config.py # Model's inference files Document_Classifier_inference/model.pt -# Database backup -backup.json - -# Prod backup -prod_backup-20240423.json +# Ignore Database Backup files +/backups +*.sql +*.gz diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c4d553f..e00d2c3f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,48 +3,77 @@ default_stages: [commit] repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + - id: check-merge-conflict + - id: debug-statements - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.17.0 hooks: - id: pyupgrade args: [--py310-plus] - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 args: ["--config=setup.cfg"] additional_dependencies: [flake8-isort] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.0 + rev: v1.11.2 hooks: - id: mypy args: ["--strict"] - # 
ignoring everything for now - exclude: . - additional_dependencies: [django-stubs, celery, django-environ, django-extensions, django-crispy-forms, - crispy-bootstrap5, django-allauth, django-celery-beat, djangorestframework, djangorestframework-datatables, - django-debug-toolbar, psycopg2-binary, python-slugify, xmltodict, PyGithub, boto3, scrapy, types-requests] + exclude: "." + additional_dependencies: + - django-stubs + - celery + - django-environ + - django-extensions + - django-crispy-forms + - crispy-bootstrap5 + - django-allauth + - django-celery-beat + - djangorestframework + - djangorestframework-datatables + - django-debug-toolbar + - psycopg2-binary + - python-slugify + - xmltodict + - PyGithub + - boto3 + - scrapy + - types-requests + + - repo: https://github.com/PyCQA/bandit + rev: '1.7.0' + hooks: + - id: bandit + args: ['-r', '--configfile=bandit-config.yml'] + + - repo: https://github.com/zricethezav/gitleaks + rev: 'v8.0.4' + hooks: + - id: gitleaks + args: ['--config=gitleaks-config.toml'] + -# sets up .pre-commit-ci.yaml to ensure pre-commit dependencies stay up to date ci: autoupdate_schedule: weekly skip: [] diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..1a4f5035 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,67 @@ +## Overview +These are not the release notes, which can be found https://github.com/NASA-IMPACT/COSMOS/releases. Instead, this is a changelog that developers use to log key changes to the codebase with each pull request. + +## What to Include +For each PR made, an entry should be added to this changelog. It should contain +- a brief description of the deliverable of the feature or bugfix +- exact listing of key changes such as: + - API endpoint modified + - frontend components added + - model updates + - deployment changes needed on the servers + - etc. + +## Changelog +- 2889-serialize-the-tdamm-tags + - Description: Have TDAMM serialzed in a specific way and exposed via the Curated URLs API to be consumed into SDE Test/Prod + - Changes: + - Changed `get_tdamm_tag` method in the `CuratedURLAPISerializer` to process the TDAMM tags and pass them to the API endpoint + +- 960-notifications-add-a-dropdown-with-options-on-the-feedback-form + - Description: Generate an API endpoint and publish all the dropdown options necessary as a list for LRM to consume it. + - Changes: + - Created a new model `FeedbackFormDropdown` + - Added the migration file + - Added the `dropdown_option` field to the `Feedback` model + - Updated the slack notification structure by adding the dropdown option text + - Created a new serializer called `FeedbackFormDropdownSerializer` + - Added a new API endpoint `feedback-form-dropdown-options-api/` where the list is going to be accesible + - Added a list view called `FeedbackFormDropdownListView` + - Added tests + +- 1217-add-data-validation-to-the-feedback-form-api-to-restrict-html-content + - Description: The feedback form API does not currently have any form of data validation on the backend which makes it easy for the user with the endpoint to send in data with html tags. We need to have a validation scheme on the backend to protect this from happening. 
+ - Changes: + - Defined a class `HTMLFreeCharField` which inherits `serializers.CharField` + - Used regex to catch any HTML content comming in as an input to form fields + - Called this class within the serializer for necessary fields + +- 3227-bugfix-title-patterns-selecting-multi-url-pattern-does-nothing + - Description: When selecting options from the match pattern type filter, the system does not filter the results as expected. Instead of displaying only the chosen variety of patterns, it continues to show all patterns. + - Changes: + - In `title_patterns_table` definition, corrected the column reference + - Made `match_pattern_type` searchable + - Corrected the column references and made code consistent on all the other tables, i.e., `exclude_patterns_table`, `include_patterns_table`, `division_patterns_table` and `document_type_patterns_table` + +- 1001-tests-for-critical-functionalities + - Description: Critical functionalities have been identified and listed, and critical areas lacking tests listed + - Changes: + - Integrated coverage.py as an indicative tool in the workflow for automated coverage reports on PRs, with separate display from test results. + - Introduced docs/architecture-decisions/testing_strategy.md, which includes the coverage report, lists critical areas, and specifically identifies those critical areas that are untested or under-tested. + +- 1192-finalize-the-infrastructure-for-frontend-testing + - Description: Set up comprehensive frontend testing infrastructure using Selenium WebDriver with Chrome, establishing a foundation for automated UI testing. + - Changes: + - Added Selenium testing dependency to `requirements/local.txt` + - Updated Dockerfile to support Chrome and ChromeDriver + - Created BaseTestCase and AuthenticationMixin for reusable test components + - Implemented core authentication test suite + +- 1195-implement-unit-test-for-forms-on-the-frontend + - Description: Implemented comprehensive frontend test suite covering authentication, collection management, search functionality, and pattern application forms. + - Changes: + - Added tests for authentication flows + - Implemented collection display and data table tests + - Added universal search functionality tests + - Created search pane filter tests + - Added pattern application form tests with validation checks diff --git a/CODE_STANDARDS.md b/CODE_STANDARDS.md new file mode 100644 index 00000000..39d473b7 --- /dev/null +++ b/CODE_STANDARDS.md @@ -0,0 +1,63 @@ +# Coding Standards and Conventions for COSMOS + +## Overview +To maintain high-quality code and ensure consistency across the entire COSMOS project, we have established coding standards and conventions. This document outlines the key standards and practices that all contributors are expected to follow. Adhering to these guidelines helps us to achieve a codebase that appears as if it were written by a single entity, regardless of the number of contributors. + +## Coding Standards + +### Formatting Standards +- **Line Length**: Maximum of 120 characters per line to ensure readability across various environments. +- **Code Formatting**: Utilize tools like Black for Python code to ensure consistent formatting across the entire codebase. +- **Import Ordering**: Follow a consistent import order: + - Standard library imports. + - Third-party imports. + - Application-specific imports. + +### Naming Conventions +- **Variables and Functions**: Use `snake_case`. +- **Classes and Exceptions**: Use `CamelCase`. +- **Constants**: Use `UPPER_CASE`. 
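+
+For illustration, a short snippet that follows these conventions might look like this (the names below are invented for the example, not taken from the codebase):
+
+```python
+MAX_RETRY_COUNT = 3  # constant: UPPER_CASE
+
+
+class CollectionImportError(Exception):  # class / exception: CamelCase
+    """Raised when a collection cannot be imported."""
+
+
+def fetch_collection_titles(collection_id: int) -> list[str]:  # function: snake_case
+    remaining_attempts = MAX_RETRY_COUNT  # variable: snake_case
+    if remaining_attempts == 0:
+        raise CollectionImportError(f"Could not fetch titles for collection {collection_id}")
+    return []
+```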
+ +### Commenting +- Inline comments should be used sparingly and only when necessary to explain "why" something is done, not "what" is done. +- All public methods, classes, and modules should include docstrings that follow the [Google style guide](https://google.github.io/styleguide/pyguide.html). + +### Error Handling +- Explicit is better than implicit. Raise exceptions rather than returning None or any error codes. +- Use custom exceptions over generic exceptions when possible to make error handling more predictive. + +## Tool Configurations and Pre-commit Hooks + +To automate and enforce these standards, the following tools are configured with pre-commit hooks in our development process: + +### Pre-commit Hooks Setup + +To ensure that these tools are run automatically on every commit, contributors must set up pre-commit hooks locally. Run the following commands to install and configure pre-commit hooks: + +```bash +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` + +The following pre-commit hooks are configured: + +- trailing-whitespace, end-of-file-fixer, check-yaml, check-merge-conflict, debug-statements: Checks for common formatting issues. +- pyupgrade: Automatically upgrades syntax for newer versions of the language. +- black: Formats Python code to ensure consistent styling. +- isort: Sorts imports alphabetically and automatically separated into sections. +- flake8: Lints code to catch styling errors and potential bugs. +- mypy: Checks type annotations to catch potential bugs. +- bandit: Scans code for common security issues. +- gitleaks: Prevents secrets from being committed to the repository. +- hadolint: Lints Dockerfiles to ensure best practices and common conventions are followed. + +## Continuous Integration (CI) +When a commit is pushed to a branch that is part of a Pull Request, our Continuous Integration (CI) pipeline automatically runs specified tools to check code quality, style, security and other standards. If these checks fail, the PR cannot be merged until all issues are resolved. + +## Quality Standards Enforcement +- PRs must pass all checks from the configured pre-commit hooks and CI pipeline to be eligible for merging. +- Code reviews additionally focus on logical errors and code quality beyond what automated tools can detect. + +## Conclusion +By adhering to these standards and utilizing the tools set up, we maintain the high quality and consistency of our codebase, making it easier for developers to collaborate effectively. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..b3d25f62 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,69 @@ +Thank you for your interest in contributing to COSMOS! We welcome contributions and appreciate your help in making this project better. Please follow the guidelines below to ensure a smooth contribution process. + +## Pull Requests + +### Prerequisites + +- **GitHub CLI (`gh`)**: Make sure you have the GitHub CLI installed. If not, you can install it from [GitHub CLI installation page](https://cli.github.com/). + +### 1. **Create an Issue on the Repo** + +1. **Navigate to Your Repository**: + + ```bash + $ cd path/to/your/repository + ``` + +2. **Create an Issue**: +Use the `gh issue create` command to create a new issue. + + ```bash + $ gh issue create --title "Issue Title" --body "Description of the issue" + ``` + + After running this command, you’ll get an issue number in the output. Note this number as it will be used to create a branch. + + +### 2. 
**Create a Branch for the Issue** + +1. **Create a Branch**: +Use the `gh` CLI to create a branch associated with the issue. The `gh` CLI can automatically create a branch for you based on the issue number. In this case, the `` is 989. + + ```bash + $ gh issue develop -c 989 + github.com/NASA-IMPACT/COSMOS/tree/989-make-coding-syntax-consistent + From https://github.com/NASA-IMPACT/COSMOS + * [new branch] 989-make-coding-syntax-consistent -> origin/989-make-coding-syntax-consistent + + ``` + + This command creates a new branch named `-issue` and switches to it. This branch will be used to work on the issue. + +2. **Make Your Changes and Push:** +Edit files, add code, or make any changes needed to address the issue. Commit your changes and push the branch to the remote repository. + + ```bash + git add . + git commit -m "Fixes issue #" + git push origin -issue + ``` + + +### 3. **Create a Pull Request** + +1. **Create the Pull Request**: +After pushing the branch, create a pull request using the `gh pr create` command: + + ```bash + gh pr create --base dev --head -issue --title "Title of the Pull Request" --body "Description of the changes" + ``` + + - **`-base`**: The base branch you want to merge your changes into (`dev` in our case) + - **`-head`**: The branch that contains your changes (e.g., `-issue`). + - **`-title`**: The title of the pull request. + - **`-body`**: The description or body of the pull request. + + This command will create a pull request from your branch into the base branch specified. + +2. **Review and Merge**: +Once the pull request is created, we will review it on GitHub and merge it if everything looks good. If any changes are required, we might ask you to make adjustments before the merge. diff --git a/README.md b/README.md index 61cf6b50..ab3da78b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ $ docker-compose -f local.yml build ```bash $ docker-compose -f local.yml up ``` - ### Non-Docker Local Setup If you prefer to run the project without Docker, follow these steps: @@ -69,54 +68,104 @@ $ docker-compose -f local.yml run --rm django python manage.py createsuperuser #### Creating Additional Users Create additional users through the admin interface (/admin). +## Database Backup and Restore + +COSMOS provides dedicated management commands for backing up and restoring your PostgreSQL database. These commands handle both compressed and uncompressed backups and work seamlessly in both local and production environments using Docker. -### Loading Fixtures +### Backup Directory Structure -To load collections: +All backups are stored in the `/backups` directory at the root of your project. This directory is mounted as a volume in both local and production Docker configurations, making it easy to manage backups across different environments. +- Local development: `./backups/` +- Production server: `/path/to/project/backups/` + +If the directory doesn't exist, create it: ```bash -$ docker-compose -f local.yml run --rm django python manage.py loaddata sde_collections/fixtures/collections.json +mkdir backups ``` -### Manually Creating and Loading a ContentTypeless Backup -Navigate to the server running prod, then to the project folder. 
Run the following command to create a backup: +### Creating a Database Backup + +To create a backup of your database: ```bash -docker-compose -f production.yml run --rm --user root django python manage.py dumpdata --natural-foreign --natural-primary --exclude=contenttypes --exclude=auth.Permission --indent 2 --output /app/backups/prod_backup-20240812.json +# Create a compressed backup (recommended) +docker-compose -f local.yml run --rm django python manage.py database_backup + +# Create an uncompressed backup +docker-compose -f local.yml run --rm django python manage.py database_backup --no-compress + +# Specify custom output location within backups directory +docker-compose -f local.yml run --rm django python manage.py database_backup --output my_custom_backup.sql ``` -This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine. + +The backup command will automatically: +- Detect your server environment (Production/Staging/Local) +- Use database credentials from your environment settings +- Generate a dated filename if no output path is specified +- Save the backup to the mounted `/backups` directory +- Compress the backup by default (can be disabled with --no-compress) + +### Restoring from a Database Backup + +To restore your database from a backup, it will need to be in the `/backups` directory. You can then run the following command: ```bash -mv ~/prod_backup-20240812.json /prod_backup-20240812.json -scp sde:/home/ec2-user/sde_indexing_helper/backups/prod_backup-20240812.json prod_backup-20240812.json +# Restore from a backup (handles both .sql and .sql.gz files) +docker-compose -f local.yml run --rm django python manage.py database_restore backups/backup_file_name.sql.gz ``` -Finally, load the backup into your local database: +The restore command will: +- Automatically detect if the backup is compressed (.gz) +- Terminate existing database connections +- Drop and recreate the database +- Restore all data from the backup +- Handle all database credentials from your environment settings + +### Working with Remote Servers +When working with production or staging servers: + +1. First, SSH into the appropriate server: ```bash -docker-compose -f local.yml run --rm django python manage.py loaddata prod_backup-20240812.json +# For production +ssh user@production-server +cd /path/to/project ``` -### Loading the Database from an Arbitrary Backup +2. Create a backup on the remote server: +```bash +docker-compose -f production.yml run --rm django python manage.py database_backup +``` -1. Build the project and run the necessary containers (as documented above). -2. Clear out content types using the Django shell: +3. Copy the backup from the remote server's backup directory to your local machine: +```bash +scp user@remote-server:/path/to/project/backups/backup_name.sql.gz ./backups/ +``` +4. Restore locally: ```bash -$ docker-compose -f local.yml run --rm django python manage.py shell ->>> from django.contrib.contenttypes.models import ContentType ->>> ContentType.objects.all().delete() ->>> exit() +docker-compose -f local.yml run --rm django python manage.py database_restore backups/backup_name.sql.gz ``` -3. 
Load your backup database: +### Alternative Methods + +While the database_backup and database_restore commands are the recommended approach, you can also use Django's built-in fixtures for smaller datasets: ```bash -$ docker cp /path/to/your/backup.json container_name:/path/inside/container/backup.json -$ docker-compose -f local.yml run --rm django python manage.py loaddata /path/inside/the/container/backup.json -$ docker-compose -f local.yml run --rm django python manage.py migrate +# Create a backup excluding content types +docker-compose -f production.yml run --rm django python manage.py dumpdata \ + --natural-foreign --natural-primary \ + --exclude=contenttypes --exclude=auth.Permission \ + --indent 2 \ + --output backups/prod_backup-$(date +%Y%m%d).json + +# Restore from a fixture +docker-compose -f local.yml run --rm django python manage.py loaddata backups/backup_name.json ``` +Note: For large databases (>1.5GB), the database_backup and database_restore commands are strongly recommended over JSON fixtures as they handle large datasets more efficiently. + ## Additional Commands ### Type Checks @@ -176,6 +225,7 @@ $ pip install pre-commit $ pre-commit install $ pre-commit run --all-files ``` +For detailed information on the coding standards and conventions we enforce, please see our [Coding Standards and Conventions](CODE_STANDARDS.md). ### Sentry Setup @@ -191,8 +241,7 @@ Documented [here](https://github.com/NASA-IMPACT/sde-indexing-helper/wiki/How-to ## Adding New Features/Fixes -1. Start with a [GitHub issue](https://github.com/NASA-IMPACT/sde-indexing-helper/issues). -2. Use the GitHub CLI to create branches and pull requests (`gh issue develop -c `). +We welcome contributions to improve the project! Before you begin, please take a moment to review our [Contributing Guidelines](./CONTRIBUTING.md). These guidelines will help you understand the process for submitting new features, bug fixes, and other improvements. ## Job Creation @@ -205,3 +254,24 @@ Eventually, job creation will be done seamlessly by the webapp. Until then, edit - JavaScript: `/sde_indexing_helper/static/js` - CSS: `/sde_indexing_helper/static/css` - Images: `/sde_indexing_helper/static/images` + + +## Running Long Scripts on the Server +```shell +tmux new -s docker_django +``` +Once you are inside, you can run dmshell or for example a managment command: + +```shell +docker-compose -f production.yml run --rm django python manage.py deduplicate_urls +``` + +Later, you can do this to get back in. +```shell +tmux attach -t docker_django +``` + +To delete the session: +```shell +tmux kill-session -t docker_django +``` diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md new file mode 100644 index 00000000..1ced2857 --- /dev/null +++ b/RELEASE_NOTES.md @@ -0,0 +1,108 @@ +# COSMOS Release Notes +## v3.0.0 from v2.0.1 + +COSMOS v3.0.0 introduces several major architectural changes that fundamentally enhance the system's capabilities. The primary feature is a new website reindexing system that allows COSMOS to stay up-to-date with source website changes, addressing a key limitation of previous versions where websites could only be scraped once. This release includes comprehensive updates to the data models, frontend interface, rule creation system, and backend processing along with some bugfixes from v2.0.1. 
+ +The Environmental Justice (EJ) system has been significantly expanded, growing less than 100 manually curated datasets to approximately 1,000 datasets through the integration of machine learning classification of NASA CMR records. This expansion is supported by a new modular processing suite that generates and extracts metadata using Subject Matter Expert (SME) criteria. + +To support future machine learning integration, COSMOS now implements a sophisticated two-column system that allows fields to maintain both ML-generated classifications and manual curator overrides. This system has been seamlessly integrated into the data models, serializers, and APIs, ensuring that both automated and human-curated data can coexist while maintaining clear precedence rules. + +To ensure reliability and maintainability of these major changes, this release includes extensive testing coverage with 213 new tests spanning URL processing, pattern management, Environmental Justice functionality, workflow triggers, and data migrations. Additionally, we've added comprehensive documentation across 15 new README files that cover everything from fundamental pattern system concepts to detailed API specifications and ML integration guidelines. + + +### Major Features + +#### Reindexing System +- **New Data Models**: Introduced DumpUrl, DeltaUrl, and CuratedUrl to support the reindexing workflow +- **Automated Workflows**: + - New process to calculate deltas, deletions, and additions during migration + - Automatic promotion of DeltaUrls to CuratedUrls + - Status-based triggers for data ingestion and processing +- **Duplicate Prevention**: System now prevents duplicate patterns and URLs +- **Enhanced Frontend**: + - Added reindexing status column to collection and URL list pages + - New deletion tracking column on URL list page + - Updated collection list to display delta URL counts + - Improved URL list page accessibility via delta URL count + +#### Pattern System Improvements +- Complete modularization of the pattern system +- Enhanced handling of edge cases including overlapping patterns +- Improved unapply logic +- Functional inclusion rules +- Pattern precedence system: most specific pattern takes priority, with pattern length as tiebreaker + +#### Environmental Justice (EJ) Enhancement +- Expanded from 92 manual datasets to 1063 ML-classified NASA CMR records +- New modular processing suite for metadata generation +- Enhanced API with multiple data sources: + - Spreadsheet (original manual classifications) + - ML Production + - ML Testing + - Combined (ML production with spreadsheet overrides) +- Custom processing suite for CMR metadata extraction + +#### Infrastructure Updates +- Streamlined database backup and restore +- Optimized Docker builds +- Fixed LetsEncrypt staging issues +- Modified Traefik timeouts for long-running jobs +- Updated Sinequa worker configuration: + - Reduced worker count to 3 for neural workload optimization + - Added neural indexing to all webcrawlers + - Removed deprecated version mappings + +#### API Enhancements +- New endpoints for curated and delta URLs: + - GET /curated-urls-api// + - GET /delta-urls-api// +- Backwards compatibility through remapped CandidateUrl endpoint +- Updated Environmental Justice API with new data source parameter + +### Technical Improvements + +#### Two-Column System +- New architecture to support dual ML/manual classifications +- Seamless integration with models, serializers, and APIs +- Prioritization system for manual overrides + +#### Testing +Added 
213 new tests across multiple areas: +- URL APIs and processing (19 tests) +- Delta and pattern management (31 tests) +- Environmental Justice API (7 tests) +- Environmental Justice Mappings and Thresholding (58) +- Workflow and status triggers (10 tests) +- Migration and promotion processes (31 tests) +- Field modifications and TDAMM tags (25 tests) +- Additional system functionality (30 tests) + + +#### Documentation +Added comprehensive documentation across 15 READMEs covering: +- Pattern system fundamentals and examples +- Reindexing statuses and triggers +- Model lifecycles and testing procedures +- URL inclusion/exclusion logic +- Environmental Justice classifier and API +- ML column functionality +- SQL dump restoration + +### Bug Fixes +- Fixed non-functional includes +- Resolved pagination issues for patterns (previously limited to 50) +- Eliminated ability to create duplicate URLs and patterns +- Corrected faulty unapply logic for modification patterns +- Fixed unrepeatable logic for overlapping patterns +- Allowed long running jobs to complete without timeouts + +### UI Updates +- Renamed application from "SDE Indexing Helper" to "COSMOS" +- Refactored collection list code for easier column management +- Enhanced URL list page with new status and deletion tracking +- Improved navigation through delta URL count integration + +### Administrative Changes +- Added new admin panels for enhanced system management +- Updated installation requirements +- Enhanced database backup and restore functionality diff --git a/SQLDumpRestoration.md b/SQLDumpRestoration.md new file mode 100644 index 00000000..866093db --- /dev/null +++ b/SQLDumpRestoration.md @@ -0,0 +1,192 @@ +## Restoring the Database from SQL Dump + +We generally load a database backup from a JSON file by using the following command. + +``` +docker-compose -f local.yml run --rm django python manage.py loaddata backup.json +``` + +However, if the JSON file is particularly large (>1.5GB), Docker might struggle with this method. In such cases, you can use SQL dump and restore commands as an alternative. + +### Steps for Using SQL Dump and Restore + +1. Begin by starting only the PostgreSQL container. This prevents the Django container from making changes while the PostgreSQL container is starting up. + +``` +docker-compose -f local.yml up postgres +``` + +2. Find the container ID using `docker ps`, then enter the PostgreSQL container to execute commands. + +``` +$ docker ps +CONTAINER ID IMAGE COMMAND +23d33f22cc43 sde_indexing_helper_production_postgres "docker-entrypoint.s…" + +$ docker exec -it 23d33f22cc43 bash +``` + +3. Create a connection to the database. + +``` +psql -U -d +``` + +**Note**: +- For local deployment, refer to the `.envs/.local/.postgres` file for the `POSTGRES_USER` and `POSTGRES_DB` variables. +- For production deployment, refer to the `.envs/.production/.postgres` file. + +4. Ensure that the database `` is empty. Here's an example: + +``` +sde_indexing_helper-# \c +You are now connected to database "sde_indexing_helper" as user "VnUvMKBSdk...". +sde_indexing_helper-# \dt +Did not find any relations. +``` + +If the database is not empty, delete its contents to create a fresh database: + +``` +sde_indexing_helper=# \c postgres //connect to a different database before dropping +You are now connected to database "postgres" as user "VnUvMKBSdk....". +postgres=# DROP DATABASE sde_indexing_helper; +DROP DATABASE +postgres=# CREATE DATABASE sde_indexing_helper; +CREATE DATABASE + +``` + +5. 
Transfer the backup SQL dump (`backup.sql`) from your local machine to the PostgreSQL container. + +``` +docker cp /local/path/backup.sql 23d33f22cc43:/ +``` + +6. Import the SQL dump into the PostgreSQL container. + +``` +psql -U -d -f backup.sql +``` + +**Note**: To create a SQL dump of your PostgreSQL database, use the following command: + +``` +pg_dump -U -W -F p -f backup.sql +``` + +7. Bring up all containers at once, and create a superuser account for logging in. + +``` +docker-compose -f local.yml up +docker-compose -f local.yml run --rm django python manage.py createsuperuser +``` + +8. Log in to the COSMOS frontend to ensure that all data has been correctly populated in the UI. + + + +# making the backup + +```bash +ssh sde +cat .envs/.production/.postgres +``` + +find the values for the variables: +POSTGRES_HOST=sde-indexing-helper-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com +POSTGRES_PORT=5432 +POSTGRES_DB=postgres +POSTGRES_USER=postgres +POSTGRES_PASSWORD=this_is_A_web_application_built_in_2023 + +```bash +docker ps +``` + +b3fefa2c19fb + +note here that you need to put the +```bash +docker exec -t your_postgres_container_id pg_dump -U your_postgres_user -d your_database_name > backup.sql +``` +```bash +docker exec -t container_id pg_dump -h host -U user -d database -W > prod_backup.sql +``` + +docker exec -t b3fefa2c19fb env PGPASSWORD="this_is_A_web_application_built_in_2023" pg_dump -h sde-indexing-helper-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com -U postgres -d postgres > prod_backup.sql + +# move the backup to local + go back to local computer and scp the file + +```bash +scp sde:/home/ec2-user/sde_indexing_helper/prod_backup.sql . +``` +scp prod_backup.sql sde_staging:/home/ec2-user/sde-indexing-helper +if you have trouble transferring the file, you can use rsync: +rsync -avzP prod_backup.sql sde_staging:/home/ec2-user/sde-indexing-helper/ + +# restoring the backup +bring down the local containers +```bash +docker-compose -f local.yml down +docker-compose -f local.yml up postgres +docker ps +``` + +find the container id + +c11d7bae2e56 + +find the local variables from +cat .envs/.production/.postgres +POSTGRES_HOST=sde-indexing-helper-staging-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com +POSTGRES_PORT=5432 +POSTGRES_DB=sde_staging +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres + + +```bash +docker exec -it bash +``` +docker exec -it c11d7bae2e56 bash + +## do all the database shit you need to + + +psql -U -d +psql -U postgres -d sde_staging +or, if you are on one of the servers: +psql -h sde-indexing-helper-staging-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com -U postgres -d postgres + +\c postgres +DROP DATABASE sde_staging; +CREATE DATABASE sde_staging; + +# do the backup + +```bash +docker cp prod_backup.sql c11d7bae2e56:/ +docker exec -it c11d7bae2e56 bash +``` + +```bash +psql -U -d -f backup.sql +``` +psql -U VnUvMKBSdkoFIETgLongnxYHrYVJKufn -d sde_indexing_helper -f prod_backup.sql + +psql -h sde-indexing-helper-staging-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com -U postgres -d postgres -f prod_backup.sql +pg_restore -h sde-indexing-helper-staging-db.c3cr2yyh5zt0.us-east-1.rds.amazonaws.com -U postgres -d postgres prod_backup.sql + + + +docker down + +docker up build + +migrate + +down + +up diff --git a/bandit-config.yml b/bandit-config.yml new file mode 100644 index 00000000..36a65525 --- /dev/null +++ b/bandit-config.yml @@ -0,0 +1,26 @@ +# bandit-config.yml +skips: + - B101 # Skip assert used (often used in tests) + - B403 # Skip import from the pickle 
module + +exclude: + - ./tests/ # Exclude test directories + - ./migrations/ # Exclude migration directories + - ./venv/ # Exclude virtual environment + +tests: + - B105 # Include test for hardcoded password strings + - B602 # Include test for subprocess call with shell equals true + +profiles: + default: + include: + - B403 # Include test for dangerous default argument + exclude: + - B401 # Exclude test for import telnetlib + +# Set the severity level to focus on higher-risk issues +severity: 'HIGH' + +# Set the confidence level to ensure that reported issues are likely true positives +confidence: 'HIGH' diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index 5d7fa082..b2fc17e6 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -38,6 +38,8 @@ WORKDIR ${APP_HOME} # Install required system dependencies RUN apt-get update && apt-get install --no-install-recommends -y \ + wget \ + gnupg \ # psycopg2 dependencies libpq-dev \ # Translations dependencies @@ -45,6 +47,12 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ # pycurl dependencies libcurl4-openssl-dev \ libssl-dev \ + # PostgreSQL 15 + && sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main" > /etc/apt/sources.list.d/pgdg.list' \ + && wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \ + && apt-get update \ + && apt-get install -y postgresql-15 postgresql-client-15 \ + && apt-get install -y chromium chromium-driver \ # cleaning up unused files && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile index d470d493..4e4358bd 100644 --- a/compose/production/django/Dockerfile +++ b/compose/production/django/Dockerfile @@ -23,7 +23,6 @@ COPY ./requirements . 
RUN pip wheel --wheel-dir /usr/src/app/wheels \ -r ${BUILD_ENVIRONMENT}.txt - # Python 'run' stage FROM python AS python-run-stage @@ -39,9 +38,10 @@ WORKDIR ${APP_HOME} RUN addgroup --system django \ && adduser --system --ingroup django django - # Install required system dependencies RUN apt-get update && apt-get install --no-install-recommends -y \ + wget \ + gnupg \ # psycopg2 dependencies libpq-dev \ # Translations dependencies @@ -49,6 +49,11 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ # pycurl dependencies libcurl4-openssl-dev \ libssl-dev \ + # PostgreSQL 15 + && sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main" > /etc/apt/sources.list.d/pgdg.list' \ + && wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \ + && apt-get update \ + && apt-get install -y postgresql-15 postgresql-client-15 \ # cleaning up unused files && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* @@ -61,25 +66,22 @@ COPY --from=python-build-stage /usr/src/app/wheels /wheels/ RUN pip install --no-cache-dir --no-index --find-links=/wheels/ /wheels/* \ && rm -rf /wheels/ - COPY --chown=django:django ./compose/production/django/entrypoint /entrypoint RUN sed -i 's/\r$//g' /entrypoint RUN chmod +x /entrypoint - COPY --chown=django:django ./compose/production/django/start /start RUN sed -i 's/\r$//g' /start RUN chmod +x /start + COPY --chown=django:django ./compose/production/django/celery/worker/start /start-celeryworker RUN sed -i 's/\r$//g' /start-celeryworker RUN chmod +x /start-celeryworker - COPY --chown=django:django ./compose/production/django/celery/beat/start /start-celerybeat RUN sed -i 's/\r$//g' /start-celerybeat RUN chmod +x /start-celerybeat - COPY ./compose/production/django/celery/flower/start /start-flower RUN sed -i 's/\r$//g' /start-flower RUN chmod +x /start-flower diff --git a/compose/production/django/start b/compose/production/django/start index 97216fa1..a8852d8a 100644 --- a/compose/production/django/start +++ b/compose/production/django/start @@ -7,4 +7,8 @@ set -o nounset python /app/manage.py collectstatic --noinput -exec /usr/local/bin/gunicorn config.wsgi --bind 0.0.0.0:5000 --chdir=/app +exec /usr/local/bin/gunicorn config.wsgi \ + --bind 0.0.0.0:5000 \ + --chdir=/app \ + --timeout 600 \ + --graceful-timeout 600 \ diff --git a/compose/production/traefik/traefik.yml b/compose/production/traefik/traefik.yml index d367bb22..7ab6ecb7 100644 --- a/compose/production/traefik/traefik.yml +++ b/compose/production/traefik/traefik.yml @@ -10,13 +10,28 @@ entryPoints: redirections: entryPoint: to: web-secure + transport: + respondingTimeouts: + readTimeout: "600s" + writeTimeout: "600s" + idleTimeout: "600s" web-secure: # https address: ":443" + transport: + respondingTimeouts: + readTimeout: "600s" + writeTimeout: "600s" + idleTimeout: "600s" flower: address: ":5555" + transport: + respondingTimeouts: + readTimeout: "600s" + writeTimeout: "600s" + idleTimeout: "600s" certificatesResolvers: letsencrypt: @@ -31,7 +46,7 @@ certificatesResolvers: http: routers: web-secure-router: - rule: "Host(`sde-indexing-helper.nasa-impact.net`)" + rule: 'Host(`{{ env "TRAEFIK_DOMAIN" }}`)' entryPoints: - web-secure middlewares: @@ -42,7 +57,7 @@ http: certResolver: letsencrypt flower-secure-router: - rule: "Host(`sde-indexing-helper.nasa-impact.net`)" + rule: 'Host(`{{ env "TRAEFIK_DOMAIN" }}`)' entryPoints: - flower service: flower diff --git a/config/settings/base.py 
b/config/settings/base.py index 55c3e758..0c16c59b 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -343,7 +343,9 @@ SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL") XLI_USER = env("XLI_USER") XLI_PASSWORD = env("XLI_PASSWORD") -LRM_USER = env("LRM_USER") -LRM_PASSWORD = env("LRM_PASSWORD") +LRM_DEV_USER = env("LRM_DEV_USER") +LRM_DEV_PASSWORD = env("LRM_DEV_PASSWORD") LRM_QA_USER = env("LRM_QA_USER") LRM_QA_PASSWORD = env("LRM_QA_PASSWORD") +LRM_DEV_TOKEN = env("LRM_DEV_TOKEN") +XLI_TOKEN = env("XLI_TOKEN") diff --git a/config/settings/production.py b/config/settings/production.py index aff7db28..270aa00c 100644 --- a/config/settings/production.py +++ b/config/settings/production.py @@ -70,11 +70,11 @@ # ------------------------ STATICFILES_STORAGE = "sde_indexing_helper.utils.storages.StaticRootS3Boto3Storage" COLLECTFAST_STRATEGY = "collectfast.strategies.boto3.Boto3Strategy" -STATIC_URL = f"https://{aws_s3_domain}/static/" +STATIC_URL = f"https://{aws_s3_domain}/static/" # noqa: E231 # MEDIA # ------------------------------------------------------------------------------ DEFAULT_FILE_STORAGE = "sde_indexing_helper.utils.storages.MediaRootS3Boto3Storage" -MEDIA_URL = f"https://{aws_s3_domain}/media/" +MEDIA_URL = f"https://{aws_s3_domain}/media/" # noqa: E231 # EMAIL # ------------------------------------------------------------------------------ diff --git a/config/urls.py b/config/urls.py index 93b2684f..fe43cf8b 100644 --- a/config/urls.py +++ b/config/urls.py @@ -4,9 +4,9 @@ from django.urls import include, path from django.views import defaults as default_views -admin.site.site_header = "SDE Indexing Helper Administration" # default: "Django Administration" -admin.site.index_title = "SDE Indexing Helper" # default: "Site administration" -admin.site.site_title = "SDE Indexing Helper" # default: "Django site admin" +admin.site.site_header = "COSMOS Administration" # default: "Django Administration" +admin.site.index_title = "COSMOS" # default: "Site administration" +admin.site.site_title = "COSMOS" # default: "Django site admin" urlpatterns = [ path("", include("sde_collections.urls", namespace="sde_collections")), diff --git a/config/wsgi.py b/config/wsgi.py index bc448e89..becaf23d 100644 --- a/config/wsgi.py +++ b/config/wsgi.py @@ -1,5 +1,5 @@ """ -WSGI config for SDE Indexing Helper project. +WSGI config for COSMOS. This module contains the WSGI application used by Django's development server and any production WSGI deployments. It should expose a module-level variable @@ -13,6 +13,7 @@ framework. 
""" + import os import sys from pathlib import Path diff --git a/config_generation/db_to_xml_file_based.py b/config_generation/db_to_xml_file_based.py index 88252366..14b077b7 100644 --- a/config_generation/db_to_xml_file_based.py +++ b/config_generation/db_to_xml_file_based.py @@ -98,7 +98,7 @@ def update_or_add_element_value( parent_element = xml_root if not parent_element_name else xml_root.find(parent_element_name) if parent_element is None: - raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") + raise ValueError(f"Parent element '{parent_element_name}' not found in XML.") # noqa: E713 existing_element = parent_element.find(element_name) if not add_duplicate and existing_element: diff --git a/config_generation/delete_config_folders.py b/config_generation/delete_config_folders.py index 0fc138d6..119d48fc 100644 --- a/config_generation/delete_config_folders.py +++ b/config_generation/delete_config_folders.py @@ -5,6 +5,7 @@ - commands - jobs """ + import glob import os import shutil diff --git a/config_generation/generate_collection_list.py b/config_generation/generate_collection_list.py index 86556c53..ee0e9b47 100644 --- a/config_generation/generate_collection_list.py +++ b/config_generation/generate_collection_list.py @@ -4,6 +4,7 @@ - filter anything that isn't a webcrawler - provide a variable, turned_on_remaining_webcrawlers for import by other files """ + import os from db_to_xml import XmlEditor diff --git a/config_generation/generate_commands.py b/config_generation/generate_commands.py index a538ee03..1b41858c 100644 --- a/config_generation/generate_commands.py +++ b/config_generation/generate_commands.py @@ -2,6 +2,7 @@ sometimes spot fixes need to be run on a list of collections this file provides a quick framework to generate a batch of commands based on an input json """ + from db_to_xml_file_based import XmlEditor from generate_jobs import ParallelJobCreator diff --git a/config_generation/xmls/indexing_template.xml b/config_generation/xmls/indexing_template.xml index 46e47f99..5faa8506 100644 --- a/config_generation/xmls/indexing_template.xml +++ b/config_generation/xmls/indexing_template.xml @@ -144,7 +144,7 @@ false - 8 + 3 your_url_here diff --git a/config_generation/xmls/job_template.xml b/config_generation/xmls/job_template.xml index c5406ea9..7763ecf1 100644 --- a/config_generation/xmls/job_template.xml +++ b/config_generation/xmls/job_template.xml @@ -3,7 +3,7 @@ collection - _ForceReindexation + false diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml index b7a9ce63..03f0f7aa 100644 --- a/config_generation/xmls/plugin_indexing_template.xml +++ b/config_generation/xmls/plugin_indexing_template.xml @@ -8,7 +8,7 @@ 1 - true + false SMD_Plugins/Sinequa.Plugin.WebCrawler_Index_URLList 6 @@ -20,7 +20,7 @@ - false + true true false @@ -268,8 +268,4 @@ id doc.url1 - - version - Md5(doc.url1) - diff --git a/config_generation/xmls/scraper_template.xml b/config_generation/xmls/scraper_template.xml index 0f44a2b5..4817394f 100644 --- a/config_generation/xmls/scraper_template.xml +++ b/config_generation/xmls/scraper_template.xml @@ -145,7 +145,7 @@ false - 8 + 3 diff --git a/config_generation/xmls/webcrawler_initial_crawl.xml b/config_generation/xmls/webcrawler_initial_crawl.xml index e86bc04e..9e02dd61 100644 --- a/config_generation/xmls/webcrawler_initial_crawl.xml +++ b/config_generation/xmls/webcrawler_initial_crawl.xml @@ -227,7 +227,7 @@ - 8 + 3 diff --git a/default_scraper.xml 
b/default_scraper.xml index 642b2e53..ef583b3b 100644 --- a/default_scraper.xml +++ b/default_scraper.xml @@ -33,8 +33,7 @@ false false - - + false true @@ -143,7 +142,7 @@ false - 8 + 3 enter your url here diff --git a/docs/architecture-decisions/README.md b/docs/architecture-decisions/README.md new file mode 100644 index 00000000..5fda8cd0 --- /dev/null +++ b/docs/architecture-decisions/README.md @@ -0,0 +1,48 @@ +# Architecture Decision Records (ADRs) + +This directory contains Architecture Decision Records (ADRs) documenting key architectural decisions made in this project. + +## Index of ADRs (To be updated when an ADR is added, modified, or updated) + +- [Uniform Error Handling](uniform-error-handling.md) + **Status**: Proposed | **Date**: 2024-12-12 + **Related Links**: [Issue #1112](#) + +## Maintenance Guidelines + +1. Keep Index Updated: Always update the index above when a new ADR is added or its status changes. +2. Use Consistent Formatting: Follow the provided template to ensure clarity and uniformity. +3. Cross-Reference Decisions: Link to related Issues, PRs, or other ADRs for better traceability. + +## Format for New ADRs + +To add a new ADR to this directory: + +1. Create a new markdown file in this directory with a descriptive filename (e.g., `use-graphql.md`). +2. Use the following template for the ADR content: + +```markdown +# [Title of the Decision] + +## Status +[Proposed | Accepted | Deprecated | Rejected] + +## Context +[Explain why this decision is being made. Provide background information, such as the problem to be solved, goals, and relevant constraints.] + +## Decision +[Clearly describe the decision made. Include details about what was chosen and how it will be implemented.] + +## Consequences +### Positive +[Describe the benefits of the decision.] + +### Negative +[Describe the trade-offs, risks, or potential issues resulting from this decision.] + +## Alternatives Considered +1. [Alternative 1]: [Brief description of the alternative, its pros, and cons.] +2. [Alternative 2]: [Brief description of the alternative, its pros, and cons.] + +## References +[Provide links to relevant documents, discussions, RFCs, PRs, Issues or resources that support this decision.] diff --git a/docs/architecture-decisions/testing_strategy.md b/docs/architecture-decisions/testing_strategy.md new file mode 100644 index 00000000..a8754750 --- /dev/null +++ b/docs/architecture-decisions/testing_strategy.md @@ -0,0 +1,184 @@ +## Overview +As of early 2025, we have only recently been writing tests for new features, and have about 250 tests in total, mostly centered around the EJ portal, the reindexing process, and pattern applications. + +Although this covers much of the core system logic, there still remain a number of untested logical areas such as the config file generation, core project settings, frontend features, etc. + +This document outlines a testing strategy for the project, which will guide us towards adding tests in the most critical areas first, followed by a plan to fully cover the remaining areas. 
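+
+For reference, the report in the next section comes from coverage.py; a minimal sketch of reproducing it through the library's Python API is shown below (the source packages listed and the use of pytest as the runner are assumptions, and the same numbers can be produced via the coverage CLI inside the Django container):
+
+```python
+import coverage
+import pytest
+
+# Measure only project packages; adjust the list as needed.
+cov = coverage.Coverage(source=["config", "config_generation", "sde_collections", "sde_indexing_helper"])
+cov.start()
+pytest.main(["-q"])  # run the full test suite
+cov.stop()
+cov.save()
+cov.report(show_missing=True)  # prints the Name / Stmts / Miss / Cover / Missing table below
+```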
+ +## Current Coverage +Using the coverage library, the following report was generated: +Name | Stmts | Miss | Cover | Missing +----------|--------------------------------------------------------------------------------------------------|-------|--------|-------- +config/__init__.py | 2 | 0 | 100% | +config/celery_app.py | 6 | 0 | 100% | +config/settings/__init__.py | 0 | 0 | 100% | +config/settings/base.py | 94 | 0 | 100% | +config/settings/local.py | 20 | 20 | 0% | 1-65 +config/settings/production.py | 48 | 48 | 0% | 1-162 +config/urls.py | 14 | 4 | 71% | 26-47 +config/wsgi.py | 8 | 8 | 0% | 17-36 +config_generation/__init__.py | 0 | 0 | 100% | +config_generation/api.py | 34 | 34 | 0% | 1-88 +config_generation/config_example.py | 15 | 15 | 0% | 1-69 +config_generation/db_to_xml.py | 203 | 133 | 34% | 45, 47, 50, 96, 119-125, 129-136, 142-149, 197-200, 206-214, 225-230, 242-271, 274-278, 285-292, 303-308, 311, 317, 326-332, 342-349, 361-368, 371-374, 377-378, 382-390, 393-399, 402-412, 415-429 +config_generation/db_to_xml_file_based.py | 52 | 52 | 0% | 4-119 +config_generation/delete_config_folders.py | 24 | 24 | 0% | 9-50 +config_generation/delete_server_content.py | 12 | 12 | 0% | 3-25 +config_generation/delete_webapp_collections.py | 5 | 5 | 0% | 6-12 +config_generation/export_collections.py | 36 | 36 | 0% | 1-73 +config_generation/export_whole_index.py | 28 | 28 | 0% | 1-58 +config_generation/generate_collection_list.py | 29 | 29 | 0% | 8-69 +config_generation/generate_commands.py | 41 | 41 | 0% | 6-87 +config_generation/generate_emac_indexer.py | 24 | 24 | 0% | 1-81 +config_generation/generate_jobs.py | 42 | 42 | 0% | 8-100 +config_generation/generate_scrapers.py | 15 | 15 | 0% | 2-54 +config_generation/minimum_api.py | 33 | 33 | 0% | 1-81 +config_generation/preprocess_sources.py | 25 | 25 | 0% | 1-50 +config_generation/sources_to_scrape.py | 28 | 28 | 0% | 2-1631 +docs/__init__.py | 0 | 0 | 100% | +docs/conf.py | 17 | 17 | 0% | 13-62 +environmental_justice/__init__.py | 0 | 0 | 100% | +environmental_justice/admin.py | 5 | 0 | 100% | +environmental_justice/apps.py | 4 | 0 | 100% | +environmental_justice/models.py | 29 | 1 | 97% | 44 +environmental_justice/serializers.py | 6 | 0 | 100% | +environmental_justice/views.py | 23 | 0 | 100% | +feedback/__init__.py | 0 | 0 | 100% | +feedback/admin.py | 14 | 0 | 100% | +feedback/apps.py | 4 | 0 | 100% | +feedback/models.py | 42 | 15 | 64% | 20-29, 35-44, 61-63 +feedback/serializers.py | 10 | 0 | 100% | +feedback/urls.py | 4 | 0 | 100% | +feedback/views.py | 9 | 0 | 100% | +manage.py | 16 | 16 | 0% | 2-31 +merge_production_dotenvs_in_dotenv.py | 15 | 1 | 93% | 26 +scripts/ej/cmr_processing.py | 241 | 5 | 98% | 160, 186-188, 397, 410 +scripts/ej/config.py | 6 | 0 | 100% | +scripts/ej/test_cmr_processing.py | 225 | 1 | 99% | 610 +scripts/ej/test_threshold_processing.py | 97 | 1 | 99% | 209 +scripts/ej/threshold_processing.py | 20 | 0 | 100% | +sde_collections/__init__.py | 0 | 0 | 100% | +sde_collections/admin.py | 212 | 72 | 66% | 22-24, 29, 34, 40-60, 65-81, 86-89, 98-101, 110-112, 120-134, 143, 148, 153, 158, 163, 168, 173, 178-189, 196-197, 260, 265, 270, 275, 302-303, 308-309, 314-316, 345-372, 478-480 +sde_collections/apps.py | 4 | 0 | 100% | +sde_collections/forms.py | 15 | 0 | 100% | +sde_collections/management/commands/database_backup.py | 62 | 1 | 98% | 68 +sde_collections/management/commands/database_restore.py | 83 | 8 | 90% | 34, 36, 87-89, 142-145 +sde_collections/models/__init__.py | 0 | 0 | 100% | 
+sde_collections/models/candidate_url.py | 89 | 16 | 82% | 124, 128-134, 138-142, 145, 176-177 +sde_collections/models/collection.py | 414 | 144 | 65% | 241, 269, 277-287, 291-301, 305-315, 319-344, 348-357, 361, 365, 369-376, 380-387, 394, 403-406, 419, 436-439, 449-470, 478, 482-515, 519, 523, 527, 531-532, 536, 540-546, 550-553, 558-567, 575-617, 640, 679, 689, 703, 707-732, 765, 769-777, 785 +sde_collections/models/collection_choice_fields.py | 138 | 20 | 86% | 14-17, 36-39, 56-59, 74-77, 168-171 +sde_collections/models/delta_patterns.py | 313 | 33 | 89% | 119, 123, 139, 226-227, 263, 267, 291, 382-389, 439-449, 498, 503-506, 592, 627-641 +sde_collections/models/delta_url.py | 81 | 19 | 77% | 117-125, 129-135, 139-143, 146 +sde_collections/models/pattern.py | 145 | 79 | 46% | 40-48, 56-63, 66, 69, 73-74, 78-79, 87, 94-96, 105, 117-119, 128, 139-151, 163-205, 208-212, 215-216, 230-233, 243, 257-260, 268 +sde_collections/serializers.py | 191 | 47 | 75% | 80-81, 84-85, 88-89, 92-93, 129-130, 133-134, 137-138, 141-142, 197, 201, 211-214, 244-247, 257-260, 271, 274, 307-315, 335-343, 358-366 +sde_collections/sinequa_api.py | 102 | 3 | 97% | 65, 255, 289 +sde_collections/tasks.py | 119 | 67 | 44% | 25-67, 72-108, 113-117, 122-125, 130-148, 153-155, 215-216 +sde_collections/urls.py | 17 | 0 | 100% | +sde_collections/utils/__init__.py | 0 | 0 | 100% | +sde_collections/utils/bulk_github_push.py | 8 | 8 | 0% | 7-22 +sde_collections/utils/generate_deployment_message.py | 8 | 8 | 0% | 1-24 +sde_collections/utils/github_helper.py | 115 | 93 | 19% | 12-18, 30-42, 49-52, 60-68, 81-96, 104-110, 119-123, 127-129, 132-142, 145-152, 155-172, 175, 178-185, 189-192, 196-224, 227 +sde_collections/utils/health_check.py | 123 | 106 | 14% | 33-46, 51-57, 61-98, 102-143, 155-165, 172-187, 191-273 +sde_collections/utils/paired_field_descriptor.py | 33 | 2 | 94% | 35, 52 +sde_collections/utils/slack_utils.py | 19 | 4 | 79% | 57-58, 66-67 +sde_collections/utils/title_resolver.py | 90 | 5 | 94% | 64, 75, 83, 85, 92 +sde_collections/views.py | 368 | 229 | 38% | 70, 82-89, 102-141, 144-187, 194, 208-212, 215-223, 226-237, 246, 249-251, 256-265, 273-277, 280-306, 309-315, 323-327, 330-336, 339-345, 353-355, 358-368, 410, 413-422, 430, 433-442, 450, 458, 461-475, 483, 486-490, 505-511, 523-530, 538-566, 577-583, 586-607, 610-613, 628-634 +sde_indexing_helper/__init__.py | 2 | 0 | 100% | +sde_indexing_helper/conftest.py | 9 | 0 | 100% | +sde_indexing_helper/contrib/__init__.py | 0 | 0 | 100% | +sde_indexing_helper/contrib/sites/__init__.py | 0 | 0 | 100% | +sde_indexing_helper/users/__init__.py | 0 | 0 | 100% | +sde_indexing_helper/users/adapters.py | 11 | 11 | 0% | 1-16 +sde_indexing_helper/users/admin.py | 13 | 0 | 100% | +sde_indexing_helper/users/apps.py | 10 | 0 | 100% | +sde_indexing_helper/users/context_processors.py | 3 | 0 | 100% | +sde_indexing_helper/users/forms.py | 15 | 0 | 100% | +sde_indexing_helper/users/models.py | 10 | 0 | 100% | +sde_indexing_helper/users/tasks.py | 6 | 0 | 100% | +sde_indexing_helper/users/urls.py | 4 | 0 | 100% | +sde_indexing_helper/users/views.py | 27 | 0 | 100% | +sde_indexing_helper/utils/__init__.py | 0 | 0 | 100% | +sde_indexing_helper/utils/exceptions.py | 7 | 0 | 100% | +sde_indexing_helper/utils/storages.py | 7 | 7 | 0% | 1-11 +tests/test_merge_production_dotenvs_in_dotenv.py | 13 | 0 | 100 |% + +## Critical Areas +### Config Generation +- config_generation/db_to_xml.py + - update_or_add_element_value() + - _update_config_xml() + - convert_template_to_scraper() + - 
add_document_type() + - add_url_exclude() + - add_title_mapping() + - add_job_list_item() + - get_tag_value() + - fetch_treeroot() + - fetch_document_type() +- config_generation/generate_jobs.py + - make_all_parallel_jobs() + +### Models + - environmental_justice/models.py + - sde_collections/models/collection.py + - clear_delta_urls() + - clear_dump_urls() + - refresh_url_lists_for_all_patterns () + - migrate_dump_to_delta () + - create_or_update_delta_url + - promote_to_curate + - add_to_public_query() + - create_scraper_config() + - create_indexer_config() + - create_plugin_config() + - _write_to_github() + - update_config_xml() + - apply_all_patterns() + - create_configs_on_status_change() + - sde_collections/models/collection_choice_fields.py + - sde_collections/models/delta_patterns.py + - sde_collections/models/delta_url.py + - sde_collections/models/pattern.py + - sde_indexing_helper/users/models.py + +### Views + - environmental_justice/views.py + - sde_collections/views.py + - sde_indexing_helper/users/views.py + +### Serializers and APIs + - environmental_justice/serializers.py + - sde_collections/serializers.py + +### Admin Interface + - environmental_justice/admin.py + - sde_collections/admin.py + - fetch_full_text_lrm_dev_action() + - fetch_full_text_xli_action() + - sde_indexing_helper/users/admin.py + +### Utilities and Helpers + - sde_collections/utils/github_helper.py + - sde_collections/utils/health_check.py + - sde_collections/utils/title_resolver.py + - sde_collections/utils/github_helper.py + - fetch_metadata() + - _get_contents_from_path() + +### Task Automation and Background Jobs + - sde_collections/tasks.py + +### Key Operational Pipelines in the Repository +The selection of critical areas for testing is guided by the following pipelines of the repository: +1. Sinequa config files are generated +2. COSMOS imports data from LRM Dev +3. Imported data is processed +4. Curators update URL metadata +5. Sinequa reads results from the COSMOS APIs + +### Critical Areas Lacking Tests +- **Config Generation**: Config generation files are under-tested. Develop unit tests for all critical functions in the config_generation files. +- **Project Settings**: Environment-specific configurations (`local.py`, `production.py`) have no tests. +- **Frontend Features**: Currently, there are no tests covering frontend logic and interactions. +- **Utilities and Helpers**: Essential utility modules like github_helper.py and health_check.py lack tests + diff --git a/docs/architecture-decisions/uniform-error-handling.md b/docs/architecture-decisions/uniform-error-handling.md new file mode 100644 index 00000000..f111e11f --- /dev/null +++ b/docs/architecture-decisions/uniform-error-handling.md @@ -0,0 +1,76 @@ +# Uniform Error Handling + +## Status +Proposed + +## Context +The current error handling system logs errors only in the terminal, which are neither preserved nor useful for developers or users. This approach fails to inform users of encountered issues and does not support debugging by developers. A consistent and efficient error-handling strategy is required to enhance user experience and simplify debugging. + +## Decision +The **Error Dashboard** approach is proposed to be adopted as the method for handling system errors. This decision aligns with the need to consolidate real-time and asynchronous errors in a centralized location for better tracking and resolution. 
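+
+As a rough illustration only, the dashboard could be backed by a small Django model along the following lines; every field name here is hypothetical and not part of the proposal:
+
+```python
+from django.db import models
+
+
+class ErrorRecord(models.Model):
+    """Hypothetical record displayed on the proposed error dashboard."""
+
+    occurred_at = models.DateTimeField(auto_now_add=True)
+    severity = models.CharField(max_length=10, default="error")
+    source = models.CharField(max_length=255, help_text="View, command, or task that raised the error")
+    message = models.TextField()
+    is_async = models.BooleanField(default=False, help_text="True if raised by a background (Celery) task")
+    resolved = models.BooleanField(default=False)
+
+    class Meta:
+        ordering = ["-occurred_at"]
+```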
+ +### Why This Decision Was Taken +Since there are tasks that run asynchronously (e.g., Celery tasks), errors from these operations cannot be shown to the user in real-time. To ensure that both real-time and asynchronous errors are recorded and displayed, this approach was chosen. + +- **Real-Time Errors**: These will be updated in the notification dashboard immediately as they occur. +- **Asynchronous Errors**: Errors encountered during asynchronous tasks (e.g., Celery tasks) will be recorded and updated in the dashboard once the task completes. + +To achieve this, a targeted script will be developed to monitor asynchronous tasks. Details of this script are outlined in the **Monitoring Asynchronous Tasks** section of this ADR. + +## Consequences + +### Positive +- **Centralization**: All errors are consolidated in one place, reducing the chances of overlooked issues. +- **Improved User Experience**: Users have a clear view of errors affecting their operations without relying on backend logs or email notifications. +- **Scalability**: Suitable for large-scale operations involving asynchronous and real-time tasks. + +### Negative +- **Navigation Overhead**: Users must navigate to the dashboard, which could be less convenient than inline error notifications. +- **Resource Requirements**: Developing and maintaining the dashboard requires additional frontend and backend resources. + +## Alternatives Considered + +### 1. **Logging System** +- **Approach**: Use Python’s built-in logging library to save structured logs with details like timestamps and severity levels. +- **Advantages**: + - Preserves logs for debugging. + - Differentiates critical and minor errors for better prioritization. + - Efficient for monitoring asynchronous tasks. +- **Disadvantage**: + - Primarily benefits developers; does not directly improve user interaction. + +### 2. **Frontend Notifications** +- **Approach**: Display error messages directly on the frontend. +- **Advantages**: + - Enhances user interaction by providing immediate feedback. +- **Disadvantages**: + - Resource-intensive to implement. + - Real-time feedback is challenging for asynchronous tasks. + +### 3. **Email Notifications** +- **Approach**: Send email alerts for critical failures. +- **Advantages**: + - Simple to implement. + - Suitable for asynchronous task monitoring. +- **Disadvantage**: + - Users need to register their emails. + - Requires users to check emails, reducing immediacy of feedback. + +### 4. **Error Dashboard** +- **Approach**: Display errors on a dedicated frontend dashboard. +- **Advantages**: + - Consolidates both real-time and asynchronous errors. + - Provides a centralized location for error tracking. +- **Disadvantages**: + - Requires navigation away from the operational page to view errors. + +## Monitoring Asynchronous Tasks +A targeted script will be developed to monitor asynchronous tasks for errors during import operations: +- **Trigger**: Activated at the start of an import operation. +- **Duration**: Runs for 10 minutes, polling the Flower API every minute. +- **Functionality**: + - Detects failed tasks related to the import. + - Notifies developers or users as appropriate. 
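A minimal sketch of such a monitoring script is shown below. It assumes Flower is reachable at an internal URL (here `http://flower:5555`) and uses its `GET /api/tasks` endpoint filtered by state; the URL, the task-name filter, and the `notify` hook are illustrative assumptions rather than the final implementation.

```python
# Sketch only: poll Flower for failed tasks for 10 minutes after an import starts.
import time

import requests

FLOWER_URL = "http://flower:5555"  # assumed internal address of the Flower service


def monitor_import_tasks(task_name_prefix, notify, duration_minutes=10):
    """Poll Flower once per minute and report failed tasks related to the import."""
    seen = set()
    deadline = time.time() + duration_minutes * 60
    while time.time() < deadline:
        response = requests.get(
            f"{FLOWER_URL}/api/tasks",
            params={"state": "FAILURE", "limit": 200},
            timeout=10,
        )
        response.raise_for_status()
        for task_id, task in response.json().items():
            name = task.get("name") or ""
            if name.startswith(task_name_prefix) and task_id not in seen:
                seen.add(task_id)
                notify(f"Task {name} ({task_id}) failed: {task.get('exception')}")
        time.sleep(60)
```

In practice this could run as a short-lived background job kicked off alongside the import, with `notify` writing to the same records the dashboard displays.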
+ +## References +- Issue: [#1112](#) diff --git a/docs/conf.py b/docs/conf.py index 93a07713..530d8841 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ # -- Project information ----------------------------------------------------- -project = "SDE Indexing Helper" +project = "COSMOS" copyright = """2023, NASA IMPACT""" author = "NASA IMPACT" diff --git a/docs/test/frontend_testing_methodologies.md b/docs/test/frontend_testing_methodologies.md new file mode 100644 index 00000000..3a9348a4 --- /dev/null +++ b/docs/test/frontend_testing_methodologies.md @@ -0,0 +1,283 @@ +# Frontend Testing Methodologies for Django Projects +## Overview + +This document outlines testing methodologies for Django projects with HTML forms and JavaScript enhancements, focusing on Python-based testing solutions. While going through the codebase, I can see it is primarily a JavaScript-heavy frontend that uses plain HTML forms enhanced with JavaScript/jQuery rather than server-rendered Django forms. Django forms are being used only in the admin panel of the project. + +## Primary Testing Tools + +### 1. Selenium with Python (Chosen) + +#### Capabilities +- Full browser automation +- JavaScript execution support +- Real DOM interaction +- Cross-browser testing +- Modal dialog handling +- AJAX request testing +- File upload testing +- DataTables interaction + +#### Implementation +```python +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +class TestCollectionDetail: + def setup_method(self): + self.driver = webdriver.Chrome() + self.wait = WebDriverWait(self.driver, 10) + + def test_title_change_modal(self): + # Example test for title change modal + self.driver.get("/collections/1/") + title_button = self.wait.until( + EC.element_to_be_clickable((By.ID, "change-title-btn")) + ) + title_button.click() + + modal = self.wait.until( + EC.visibility_of_element_located((By.ID, "title-modal")) + ) + + form = modal.find_element(By.TAG_NAME, "form") + input_field = form.find_element(By.NAME, "title") + input_field.send_keys("New Title") + form.submit() + + # Wait for AJAX completion + self.wait.until( + EC.text_to_be_present_in_element((By.ID, "collection-title"), "New Title") + ) +``` + +#### Pros +- Complete end-to-end testing +- Real browser interaction +- JavaScript support +- Comprehensive API +- Strong community support + +#### Drawbacks +- Slower execution +- Browser dependencies +- More complex setup +- Can be flaky with timing issues + +### 2. 
pytest-django with django-test-client + +#### Capabilities +- Form submission testing +- Response validation +- Header verification +- Status code checking +- Session handling +- Template rendering testing + +#### Implementation +```python +import pytest +from django.urls import reverse + +@pytest.mark.django_db +class TestCollectionForms: + def test_collection_create(self, client): + url = reverse('collection_create') + data = { + 'title': 'Test Collection', + 'division': 'division1', + 'workflow_status': 'active' + } + response = client.post(url, data) + assert response.status_code == 302 # Redirect after success + + # Verify creation + response = client.get(reverse('collection_detail', kwargs={'pk': 1})) + assert 'Test Collection' in response.content.decode() +``` + +#### Pros +- Fast execution +- No browser dependency +- Simpler setup +- Integrated with Django + +#### Drawbacks +- **No JavaScript support (Dealbreaker)** +- Limited DOM interaction +- Can't test real user interactions + +### 3. Playwright for Python + +#### Capabilities +- Modern browser automation +- Async/await support +- Network interception +- Mobile device emulation +- Automatic waiting +- Screenshot and video capture + +#### Implementation +```python +from playwright.sync_api import sync_playwright + +def test_modal_form_submission(): + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + + page.goto("/collections/") + + # Click button to open modal + page.click("#add-collection-btn") + + # Fill form in modal + page.fill("#title-input", "New Collection") + page.fill("#division-input", "Division A") + + # Submit form + page.click("#submit-btn") + + # Wait for success message + success_message = page.wait_for_selector(".toast-success") + assert "Collection created" in success_message.text_content() + + browser.close() +``` + +#### Pros +- Modern API design +- Better stability than Selenium +- Built-in async support +- Powerful debugging tools + +#### Drawbacks +- **Newer tool, smaller community (Dealbreaker)** +- Additional system dependencies +- Learning curve for async features + + +### 4. Beautiful Soup with Requests +A combination for testing HTML structure and content. + +**Capabilities:** +- HTML parsing and validation +- Content extraction +- Structure verification +- Link checking +- Form field validation +- Template testing + +**Pros:** +- Lightweight solution +- Flexible HTML parsing +- No browser dependency +- Fast execution +- Simple API +- Low resource usage + +**Drawbacks:** +- **No JavaScript support (Dealbreaker)** +- Limited interaction testing +- No visual testing +- Basic functionality only +- No real browser simulation + +## Feature Comparison Table + +| Feature | Selenium | Django Test Client | Playwright | Beautiful Soup | +|---------------------------|----------|-------------------|------------|----------------| +| JavaScript Support | ✅ Yes | ❌ No | ✅ Yes | ❌ No | +| Setup Complexity | 🟡 Medium | 🟢 Low | 🟡 Medium | 🟢 Low | +| Execution Speed | 🔴 Slow | 🟢 Fast | 🟡 Medium | 🟢 Fast | +| Modal Testing | ✅ Yes | ❌ No | ✅ Yes | ❌ No | +| AJAX Testing | ✅ Yes | ❌ No | ✅ Yes | ❌ No | +| Cross-browser Testing | ✅ Yes | ❌ No | ✅ Yes | ❌ No | +| Real User Interaction | ✅ Yes | ❌ No | ✅ Yes | ❌ No | +| Documentation Quality | ✅ Excellent| ✅ Good | ✅ Good | ✅ Good | +| Community Support | ✅ Large | ✅ Large | 🟡 Growing | ✅ Large | + +## Testing Strategy Recommendations + +1. 
**Primary Testing Tool**: Selenium with Python + - Best suited for your JavaScript-heavy interface + - Handles modals and AJAX naturally + - Extensive documentation and community support + +2. **Test Coverage Areas**: + - Modal form interactions + - AJAX submissions + - DataTables functionality + - Form validation + - Success/error messages + - URL routing + - DOM updates + +## Implementation Steps + +1. Add testing dependencies to requirements file `requirements/local.txt`: +```text +# Testing Dependencies +selenium>=4.15.2 +pytest-xdist>=3.3.1 +pytest-cov>=4.1.0 +``` + +2. Update Dockerfile `compose/local/django/Dockerfile` to install Chrome and ChromeDriver: +```dockerfile +# Install Chrome and ChromeDriver for Selenium tests +RUN apt-get update && apt-get install -y \ + chromium \ + chromium-driver \ + && rm -rf /var/lib/apt/lists/* +``` + +3. Rebuild Docker container to apply changes: +```bash +docker-compose -f local.yml build django +``` + +4. Create test directory structure: +```bash +mkdir -p tests/frontend +touch tests/frontend/__init__.py +touch tests/frontend/base.py +touch tests/frontend/test_setup.py +``` + +5. Create base test classes: +```python +import pytest +from selenium import webdriver + +class BaseUITest: + @pytest.fixture(autouse=True) + def setup_class(self): + self.driver = webdriver.Chrome() + yield + self.driver.quit() + + def login(self): + # Common login logic + pass +``` + +6. Organize tests by feature: +```python +class TestCollectionManagement(BaseUITest): + def test_create_collection(self): + pass + + def test_edit_collection(self): + pass + +class TestURLPatterns(BaseUITest): + def test_add_include_pattern(self): + pass +``` + +7. Run tests: +```bash +docker-compose -f local.yml run --rm django pytest tests/frontend/test_setup.py -v +``` diff --git a/document_classifier/async_scraper.py b/document_classifier/async_scraper.py index fb2fb7c7..12c039a3 100644 --- a/document_classifier/async_scraper.py +++ b/document_classifier/async_scraper.py @@ -1,4 +1,5 @@ """Asynchronously scrapes the HTML content of a given URL using a headless browser.""" + import asyncio import re diff --git a/document_classifier/encoder.py b/document_classifier/encoder.py index c62bfafc..1bacc5d9 100644 --- a/document_classifier/encoder.py +++ b/document_classifier/encoder.py @@ -1,4 +1,5 @@ """ Encoding the url response """ + import pandas as pd diff --git a/document_classifier/load_dataset.py b/document_classifier/load_dataset.py index 8c64e03b..d61efdad 100644 --- a/document_classifier/load_dataset.py +++ b/document_classifier/load_dataset.py @@ -1,4 +1,5 @@ """ Module for loading dataset """ + from torch.utils.data import DataLoader, SequentialSampler, TensorDataset diff --git a/environmental_justice/README.md b/environmental_justice/README.md new file mode 100644 index 00000000..0dffaf84 --- /dev/null +++ b/environmental_justice/README.md @@ -0,0 +1,86 @@ +# Environmental Justice API + +## Overview +This API provides access to Environmental Justice data from multiple sources. It supports retrieving data from individual sources or as a combined dataset with defined precedence rules. + +## Endpoints + +### GET /api/environmental-justice/ + +Retrieves environmental justice data based on specified data source. 
+ +#### Query Parameters + +| Parameter | Description | Default | Options | +|-------------|-------------|------------|----------------------------------------------| +| data_source | Data source filter | "combined" | "spreadsheet", "ml_production", "ml_testing", "combined" | + +#### Data Source Behavior + +1. **Single Source** + - `?data_source=spreadsheet`: Returns only spreadsheet data + - `?data_source=ml_production`: Returns only ML production data + - `?data_source=ml_testing`: Returns only ML testing data + +2. **Combined Data** (Default) + - Access via `?data_source=combined` or no parameter + - Merges data from 'spreadsheet' and 'ml_production' sources + - Precedence rules: + - If the same dataset exists in both sources, the spreadsheet version is used + - Unique datasets from ml_production are included + - ML testing data is not included in combined view + +#### Example Requests + +```bash +# Get combined data (default) +GET /api/environmental-justice/ + +# Get combined data (explicit) +GET /api/environmental-justice/?data_source=combined + +# Get only spreadsheet data +GET /api/environmental-justice/?data_source=spreadsheet + +# Get only ML production data +GET /api/environmental-justice/?data_source=ml_production + +# Get only ML testing data +GET /api/environmental-justice/?data_source=ml_testing +``` + +#### Response Fields + +Each record includes the following fields: +- dataset +- description +- description_simplified +- indicators +- intended_use +- latency +- limitations +- project +- source_link +- strengths +- format +- geographic_coverage +- data_visualization +- spatial_resolution +- temporal_extent +- temporal_resolution +- sde_link +- data_source + +## Data Source Definitions + +- **spreadsheet**: Primary source data from environmental justice spreadsheets +- **ml_production**: Production machine learning processed data +- **ml_testing**: Testing/staging machine learning processed data + +## Precedence Rules +When retrieving combined data: +1. If a dataset exists in both spreadsheet and ml_production: + - The spreadsheet version takes precedence + - The ml_production version is excluded +2. Datasets unique to ml_production are included in the response +3. 
ML testing data is never included in combined results diff --git a/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py b/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py new file mode 100644 index 00000000..c51219b4 --- /dev/null +++ b/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py @@ -0,0 +1,52 @@ +# Generated by Django 4.2.9 on 2024-11-23 03:18 + +from django.db import migrations, models + + +def migrate_destination_server_to_data_source(apps, schema_editor): + EnvironmentalJusticeRow = apps.get_model("environmental_justice", "EnvironmentalJusticeRow") + + # Migrate prod to spreadsheet + EnvironmentalJusticeRow.objects.filter(destination_server="prod").update( + data_source="spreadsheet", destination_server="" + ) + + # Migrate dev to ml_production + EnvironmentalJusticeRow.objects.filter(destination_server="dev").update( + data_source="ml_production", destination_server="" + ) + + # Migrate test to ml_testing + EnvironmentalJusticeRow.objects.filter(destination_server="test").update( + data_source="ml_testing", destination_server="" + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ("environmental_justice", "0005_environmentaljusticerow_destination_server"), + ] + + operations = [ + migrations.AddField( + model_name="environmentaljusticerow", + name="data_source", + field=models.CharField( + blank=True, + choices=[ + ("spreadsheet", "Spreadsheet"), + ("ml_production", "ML Production"), + ("ml_testing", "ML Testing"), + ], + default="", + max_length=20, + verbose_name="Data Source", + ), + ), + migrations.RunPython(migrate_destination_server_to_data_source, reverse_code=migrations.RunPython.noop), + migrations.RemoveField( + model_name="environmentaljusticerow", + name="destination_server", + ), + ] diff --git a/environmental_justice/models.py b/environmental_justice/models.py index 97cb1d61..d7cb705b 100644 --- a/environmental_justice/models.py +++ b/environmental_justice/models.py @@ -6,13 +6,13 @@ class EnvironmentalJusticeRow(models.Model): Environmental Justice data from the spreadsheet """ - class DestinationServerChoices(models.TextChoices): - DEV = "dev", "Development" - TEST = "test", "Testing" - PROD = "prod", "Production" + class DataSourceChoices(models.TextChoices): + SPREADSHEET = "spreadsheet", "Spreadsheet" + ML_PRODUCTION = "ml_production", "ML Production" + ML_TESTING = "ml_testing", "ML Testing" - destination_server = models.CharField( - "Destination Server", max_length=10, choices=DestinationServerChoices.choices, default="", blank=True + data_source = models.CharField( + "Data Source", max_length=20, choices=DataSourceChoices.choices, default="", blank=True ) dataset = models.CharField("Dataset", blank=True, default="") diff --git a/environmental_justice/tests.py b/environmental_justice/tests.py deleted file mode 100644 index 9a30df3b..00000000 --- a/environmental_justice/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase # noqa - -# Create your tests here. 
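As a quick way to sanity-check the data migration above after it runs, the hypothetical shell snippet below counts rows per `data_source`; it uses only the model and field names introduced in this diff.

```python
# Hypothetical check, run in `python manage.py shell` after applying the migration:
# every row should now carry one of the new data_source values.
from django.db.models import Count

from environmental_justice.models import EnvironmentalJusticeRow

counts = (
    EnvironmentalJusticeRow.objects.values("data_source")
    .annotate(total=Count("id"))
    .order_by("data_source")
)
for row in counts:
    print(row["data_source"] or "(blank)", row["total"])
```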
diff --git a/environmental_justice/tests/conftest.py b/environmental_justice/tests/conftest.py new file mode 100644 index 00000000..d8b53c9a --- /dev/null +++ b/environmental_justice/tests/conftest.py @@ -0,0 +1,30 @@ +import pytest +from django.urls import include, path +from rest_framework.routers import DefaultRouter +from rest_framework.test import APIClient + +from environmental_justice.views import EnvironmentalJusticeRowViewSet + +# Create router and register our viewset +router = DefaultRouter() +router.register(r"environmental-justice", EnvironmentalJusticeRowViewSet) + +# Create temporary urlpatterns for testing +urlpatterns = [ + path("api/", include(router.urls)), +] + + +# Override default URL conf for testing +@pytest.fixture +def client(): + """Return a Django REST framework API client""" + return APIClient() + + +@pytest.fixture(autouse=True) +def setup_urls(): + """Setup URLs for testing""" + from django.conf import settings + + settings.ROOT_URLCONF = __name__ diff --git a/environmental_justice/tests/factories.py b/environmental_justice/tests/factories.py new file mode 100644 index 00000000..42d05735 --- /dev/null +++ b/environmental_justice/tests/factories.py @@ -0,0 +1,28 @@ +import factory +from factory.django import DjangoModelFactory + +from environmental_justice.models import EnvironmentalJusticeRow + + +class EnvironmentalJusticeRowFactory(DjangoModelFactory): + class Meta: + model = EnvironmentalJusticeRow + + dataset = factory.Sequence(lambda n: f"dataset_{n}") + description = factory.Faker("sentence") + description_simplified = factory.Faker("sentence") + indicators = factory.Faker("sentence") + intended_use = factory.Faker("sentence") + latency = factory.Faker("word") + limitations = factory.Faker("sentence") + project = factory.Faker("word") + source_link = factory.Faker("url") + strengths = factory.Faker("sentence") + format = factory.Faker("file_extension") + geographic_coverage = factory.Faker("country") + data_visualization = factory.Faker("sentence") + spatial_resolution = factory.Faker("word") + temporal_extent = factory.Faker("date") + temporal_resolution = factory.Faker("word") + sde_link = factory.Faker("url") + data_source = EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET diff --git a/environmental_justice/tests/test_ej_api.py b/environmental_justice/tests/test_ej_api.py new file mode 100644 index 00000000..1632d45b --- /dev/null +++ b/environmental_justice/tests/test_ej_api.py @@ -0,0 +1,153 @@ +# docker-compose -f local.yml run --rm django pytest environmental_justice/tests/test_views.py +import pytest +from rest_framework import status + +from environmental_justice.models import EnvironmentalJusticeRow +from environmental_justice.tests.factories import EnvironmentalJusticeRowFactory + + +@pytest.mark.django_db +class TestEnvironmentalJusticeRowViewSet: + """Test suite for the EnvironmentalJusticeRow API endpoints""" + + def setup_method(self): + """Setup URL for API endpoint""" + self.url = "/api/environmental-justice/" + + def test_empty_database_returns_empty_list(self, client): + """Should return empty list when no records exist""" + response = client.get(self.url) + assert response.status_code == status.HTTP_200_OK + assert response.json()["results"] == [] + assert response.json()["count"] == 0 + + def test_single_source_filtering(self, client): + """Should return records only from requested data source""" + # Create records for each data source + spreadsheet_record = EnvironmentalJusticeRowFactory( + dataset="test_dataset", 
data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET + ) + ml_prod_record = EnvironmentalJusticeRowFactory( + dataset="another_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION + ) + ml_test_record = EnvironmentalJusticeRowFactory( + dataset="test_dataset_3", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_TESTING + ) + + # Test spreadsheet filter + response = client.get(f"{self.url}?data_source=spreadsheet") + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["dataset"] == spreadsheet_record.dataset + + # Test ml_production filter + response = client.get(f"{self.url}?data_source=ml_production") + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["dataset"] == ml_prod_record.dataset + + # Test ml_testing filter + response = client.get(f"{self.url}?data_source=ml_testing") + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["dataset"] == ml_test_record.dataset + + def test_combined_data_precedence(self, client): + """ + Should return combined data with spreadsheet taking precedence over ml_production + for matching datasets + """ + # Create spreadsheet record + EnvironmentalJusticeRowFactory( + dataset="common_dataset", + description="spreadsheet version", + data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET, + ) + + # Create ML production record with same dataset + EnvironmentalJusticeRowFactory( + dataset="common_dataset", + description="ml version", + data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION, + ) + + # Create unique ML production record + EnvironmentalJusticeRowFactory( + dataset="unique_ml_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION + ) + + # Test combined view (default) + response = client.get(self.url) + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 2 # Should only return 2 records (not 3) + + # Verify correct records are returned + datasets = [record["dataset"] for record in data] + assert "common_dataset" in datasets + assert "unique_ml_dataset" in datasets + + # Verify precedence - should get spreadsheet version of common dataset + common_record = next(r for r in data if r["dataset"] == "common_dataset") + assert common_record["description"] == "spreadsheet version" + + def test_combined_explicit_parameter(self, client): + """Should handle explicit 'combined' parameter same as default""" + EnvironmentalJusticeRowFactory(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET) + EnvironmentalJusticeRowFactory( + dataset="unique_ml_dataset", # Ensure different dataset + data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION, + ) + + # Compare default and explicit combined responses + default_response = client.get(self.url) + combined_response = client.get(f"{self.url}?data_source=combined") + + assert default_response.status_code == status.HTTP_200_OK + assert combined_response.status_code == status.HTTP_200_OK + assert default_response.json()["results"] == combined_response.json()["results"] + + def test_invalid_data_source(self, client): + """Should return 400 error for invalid data_source parameter""" + response = client.get(f"{self.url}?data_source=invalid") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert "Invalid 
data_source" in str(response.json()) + + def test_sorting_in_combined_view(self, client): + """Should return combined results sorted by dataset name""" + # Create records in non-alphabetical order + EnvironmentalJusticeRowFactory( + dataset="zebra_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET + ) + EnvironmentalJusticeRowFactory( + dataset="alpha_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION + ) + + response = client.get(self.url) + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + + # Verify sorting + datasets = [record["dataset"] for record in data] + assert datasets == sorted(datasets) + + def test_http_methods_allowed(self, client): + """Should only allow GET requests""" + # Test GET (should work) + get_response = client.get(self.url) + assert get_response.status_code == status.HTTP_200_OK + + # Test POST (should fail) + post_response = client.post(self.url, {}) + assert post_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED + + # Test PUT (should fail) + put_response = client.put(self.url, {}) + assert put_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED + + # Test DELETE (should fail) + delete_response = client.delete(self.url) + assert delete_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED diff --git a/environmental_justice/views.py b/environmental_justice/views.py index 4e999a4c..4959c168 100644 --- a/environmental_justice/views.py +++ b/environmental_justice/views.py @@ -1,5 +1,6 @@ from django_filters.rest_framework import DjangoFilterBackend from rest_framework import viewsets +from rest_framework.exceptions import ValidationError from .models import EnvironmentalJusticeRow from .serializers import EnvironmentalJusticeRowSerializer @@ -8,19 +9,52 @@ class EnvironmentalJusticeRowViewSet(viewsets.ModelViewSet): """ API endpoint that allows environmental justice rows to be read. + When combining spreadsheet and ml_production data, spreadsheet takes precedence + for any matching dataset values. """ queryset = EnvironmentalJusticeRow.objects.all() serializer_class = EnvironmentalJusticeRowSerializer http_method_names = ["get"] filter_backends = [DjangoFilterBackend] - filterset_fields = ["destination_server"] + filterset_fields = [] + + def get_combined_queryset(self): + """ + Returns combined data where: + 1. All spreadsheet data is included + 2. 
ML production data is included only if there's no spreadsheet data with matching dataset + Records are sorted by dataset name and then data_source (ensuring spreadsheet comes before ml_production) + """ + # Get spreadsheet data + spreadsheet_data = EnvironmentalJusticeRow.objects.filter( + data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET + ) + + # Get ML production data excluding datasets that exist in spreadsheet + ml_production_data = EnvironmentalJusticeRow.objects.filter( + data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION + ).exclude(dataset__in=spreadsheet_data.values_list("dataset", flat=True)) + + # Combine the querysets and sort + return spreadsheet_data.union(ml_production_data).order_by("dataset", "data_source") def get_queryset(self): """ - if no destination_server is provided, default to PROD + Handle different data_source filter scenarios: + - No filter: Return combined data (spreadsheet takes precedence) + - 'combined': Same as no filter + - specific source: Return data for that source only """ - queryset = super().get_queryset() - if not self.request.query_params.get("destination_server"): - queryset = queryset.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.PROD) - return queryset + data_source = self.request.query_params.get("data_source", "combined") + + # Handle the 'combined' case or no parameter case + if not data_source or data_source == "combined": + return self.get_combined_queryset() + + # Validate specific data source + if data_source not in EnvironmentalJusticeRow.DataSourceChoices.values: + valid_choices = list(EnvironmentalJusticeRow.DataSourceChoices.values) + ["combined"] + raise ValidationError(f"Invalid data_source. Valid choices are: {', '.join(valid_choices)}") + + return super().get_queryset().filter(data_source=data_source).order_by("dataset") diff --git a/feedback/migrations/0005_feedbackformdropdown_feedback_dropdown_option.py b/feedback/migrations/0005_feedbackformdropdown_feedback_dropdown_option.py new file mode 100644 index 00000000..da4bf7d8 --- /dev/null +++ b/feedback/migrations/0005_feedbackformdropdown_feedback_dropdown_option.py @@ -0,0 +1,37 @@ +# Generated by Django 4.2.9 on 2025-01-29 19:27 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("feedback", "0004_contentcurationrequest_created_at_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="FeedbackFormDropdown", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=200)), + ("display_order", models.PositiveIntegerField(default=1)), + ], + options={ + "verbose_name": "Dropdowm Option", + "verbose_name_plural": "Dropdown Options", + "ordering": ["display_order", "name"], + }, + ), + migrations.AddField( + model_name="feedback", + name="dropdown_option", + field=models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="feedback", + to="feedback.feedbackformdropdown", + ), + ), + ] diff --git a/feedback/models.py b/feedback/models.py index 0666080f..843fd2fb 100644 --- a/feedback/models.py +++ b/feedback/models.py @@ -4,12 +4,36 @@ from sde_collections.utils.slack_utils import send_slack_message +class FeedbackFormDropdown(models.Model): + DEFAULT_OPTIONS = [ + {"name": "I need help or have a general question", "display_order": 1}, + {"name": "I have a data/content question or 
comment", "display_order": 2}, + {"name": "I would like to report an error", "display_order": 3}, + {"name": "I have an idea or suggested improvement to share", "display_order": 4}, + {"name": "General comment or feedback", "display_order": 5}, + ] + + name = models.CharField(max_length=200) + display_order = models.PositiveIntegerField(default=1) + + class Meta: + ordering = ["display_order", "name"] + verbose_name = "Dropdowm Option" + verbose_name_plural = "Dropdown Options" + + def __str__(self): + return self.name + + class Feedback(models.Model): name = models.CharField(max_length=150) email = models.EmailField() subject = models.CharField(max_length=400) comments = models.TextField() source = models.CharField(max_length=50, default="SDE", blank=True) + dropdown_option = models.ForeignKey( + FeedbackFormDropdown, on_delete=models.SET_NULL, null=True, related_name="feedback" + ) created_at = models.DateTimeField(null=True, blank=True) class Meta: @@ -32,10 +56,12 @@ def format_notification_message(self): """ Returns a formatted notification message containing details from this Feedback instance. """ + dropdown_option_text = self.dropdown_option.name if self.dropdown_option else "No Option Selected" notification_message = ( - f" New Feedback Received : \n" + f" New Feedback Received : \n" # noqa: E203 f"Name: {self.name}\n" f"Email: {self.email}\n" + f"Dropdown Choice: {dropdown_option_text}\n" f"Subject: {self.subject}\n" f"Comments: {self.comments}\n" f"Source: {self.source}\n" diff --git a/feedback/serializers.py b/feedback/serializers.py index 6956e16f..f70c8181 100644 --- a/feedback/serializers.py +++ b/feedback/serializers.py @@ -1,9 +1,33 @@ +import re + from rest_framework import serializers -from .models import ContentCurationRequest, Feedback +from .models import ContentCurationRequest, Feedback, FeedbackFormDropdown + + +class FeedbackFormDropdownSerializer(serializers.ModelSerializer): + class Meta: + model = FeedbackFormDropdown + fields = ["id", "name"] + + +class HTMLFreeCharField(serializers.CharField): + def to_internal_value(self, data): + value = super().to_internal_value(data) + + if re.search(r"<[^>]+>", value): + raise serializers.ValidationError("HTML tags are not allowed in this field") + + return value class FeedbackSerializer(serializers.ModelSerializer): + + name = HTMLFreeCharField() + subject = HTMLFreeCharField() + comments = HTMLFreeCharField() + source = HTMLFreeCharField() + class Meta: model = Feedback fields = [ @@ -12,6 +36,7 @@ class Meta: "subject", "comments", "source", + "dropdown_option", "created_at", ] diff --git a/feedback/tests.py b/feedback/tests.py index e69de29b..4c956f35 100644 --- a/feedback/tests.py +++ b/feedback/tests.py @@ -0,0 +1,146 @@ +# docker compose -f local.yml run --rm django pytest feedback/tests.py + +import pytest +from django.urls import reverse +from rest_framework import status +from rest_framework.test import APIClient + +from feedback.models import ContentCurationRequest, Feedback, FeedbackFormDropdown + + +@pytest.fixture +def api_client(): + return APIClient() + + +@pytest.fixture +def dropdown_option(db): + return FeedbackFormDropdown.objects.create(name="I need help or have a general question", display_order=1) + + +@pytest.fixture +def feedback_data(dropdown_option): + return { + "name": "Test User", + "email": "test@example.com", + "subject": "Test Subject", + "comments": "Test Comments", + "source": "TEST", + "dropdown_option": dropdown_option.id, + } + + +@pytest.fixture +def content_curation_data(): + return 
{ + "name": "Test User", + "email": "test@example.com", + "scientific_focus": "Biology", + "data_type": "Genomics", + "data_link": "https://example.com/data", + "additional_info": "Extra details", + } + + +@pytest.mark.django_db +class TestFeedbackFormDropdown: + def test_dropdown_str_representation(self, dropdown_option): + """Test string representation of dropdown options""" + assert str(dropdown_option) == "I need help or have a general question" + + def test_dropdown_ordering(self): + """Test that dropdown options are ordered by display_order""" + dropdown1 = FeedbackFormDropdown.objects.create(name="First Option", display_order=1) + dropdown2 = FeedbackFormDropdown.objects.create(name="Second Option", display_order=2) + dropdowns = FeedbackFormDropdown.objects.all() + assert dropdowns[0] == dropdown1 + assert dropdowns[1] == dropdown2 + + +@pytest.mark.django_db +class TestFeedbackAPI: + def test_get_dropdown_options(self, api_client, dropdown_option): + """Test retrieving dropdown options""" + url = reverse("feedback:feedback-form-dropdown-options-api") + response = api_client.get(url) + assert response.status_code == status.HTTP_200_OK + assert len(response.data["results"]) == 1 + assert response.data["results"][0]["name"] == dropdown_option.name + + def test_create_feedback_success(self, api_client, feedback_data): + """Test successful feedback creation""" + url = reverse("feedback:contact-us-api") + response = api_client.post(url, feedback_data, format="json") + assert response.status_code == status.HTTP_201_CREATED + assert Feedback.objects.count() == 1 + + def test_create_feedback_invalid_email(self, api_client, feedback_data): + """Test feedback creation with invalid email""" + url = reverse("feedback:contact-us-api") + feedback_data["email"] = "invalid-email" + response = api_client.post(url, feedback_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert "email" in response.data["error"] + + @pytest.mark.parametrize("field", ["name", "email", "subject", "comments"]) + def test_create_feedback_missing_required_fields(self, api_client, feedback_data, field): + """Test feedback creation with missing required fields""" + url = reverse("feedback:contact-us-api") + feedback_data.pop(field) + response = api_client.post(url, feedback_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert field in response.data["error"] + + def test_create_feedback_invalid_dropdown(self, api_client, feedback_data): + """Test feedback creation with non-existent dropdown option""" + url = reverse("feedback:contact-us-api") + feedback_data["dropdown_option"] = 999 + response = api_client.post(url, feedback_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert "dropdown_option" in response.data["error"] + + +@pytest.mark.django_db +class TestContentCurationRequestAPI: + def test_create_request_success(self, api_client, content_curation_data): + """Test successful content curation request creation""" + url = reverse("feedback:content-curation-request-api") + response = api_client.post(url, content_curation_data, format="json") + assert response.status_code == status.HTTP_201_CREATED + assert ContentCurationRequest.objects.count() == 1 + + def test_create_request_without_additional_info(self, api_client, content_curation_data): + """Test request creation without optional additional info""" + url = reverse("feedback:content-curation-request-api") + del content_curation_data["additional_info"] + 
response = api_client.post(url, content_curation_data, format="json") + assert response.status_code == status.HTTP_201_CREATED + assert ContentCurationRequest.objects.first().additional_info == "" + + def test_create_request_invalid_email(self, api_client, content_curation_data): + """Test request creation with invalid email""" + url = reverse("feedback:content-curation-request-api") + content_curation_data["email"] = "invalid-email" + response = api_client.post(url, content_curation_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert "email" in response.data["error"] + + @pytest.mark.parametrize("field", ["name", "email", "scientific_focus", "data_type", "data_link"]) + def test_create_request_missing_required_fields(self, api_client, content_curation_data, field): + """Test request creation with missing required fields""" + url = reverse("feedback:content-curation-request-api") + content_curation_data.pop(field) + response = api_client.post(url, content_curation_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert field in response.data["error"] + + @pytest.mark.parametrize( + "field,length", [("name", 151), ("data_link", 1001), ("scientific_focus", 201), ("data_type", 101)] + ) + def test_create_request_field_max_lengths(self, api_client, content_curation_data, field, length): + """Test request creation with fields exceeding max length""" + url = reverse("feedback:content-curation-request-api") + content_curation_data[field] = "x" * length + response = api_client.post(url, content_curation_data, format="json") + assert response.status_code == status.HTTP_400_BAD_REQUEST + assert field in response.data["error"] diff --git a/feedback/urls.py b/feedback/urls.py index 63ee219c..b44ca0a0 100644 --- a/feedback/urls.py +++ b/feedback/urls.py @@ -1,10 +1,19 @@ from django.urls import path -from .views import ContactFormModelView, ContentCurationRequestView +from .views import ( + ContactFormModelView, + ContentCurationRequestView, + FeedbackFormDropdownListView, +) app_name = "feedback" urlpatterns = [ path("contact-us-api/", ContactFormModelView.as_view(), name="contact-us-api"), + path( + "feedback-form-dropdown-options-api/", + FeedbackFormDropdownListView.as_view(), + name="feedback-form-dropdown-options-api", + ), path( "content-curation-request-api/", ContentCurationRequestView.as_view(), diff --git a/feedback/views.py b/feedback/views.py index 6e4b0174..ca070564 100644 --- a/feedback/views.py +++ b/feedback/views.py @@ -1,7 +1,11 @@ from rest_framework import generics -from .models import ContentCurationRequest, Feedback -from .serializers import ContentCurationRequestSerializer, FeedbackSerializer +from .models import ContentCurationRequest, Feedback, FeedbackFormDropdown +from .serializers import ( + ContentCurationRequestSerializer, + FeedbackFormDropdownSerializer, + FeedbackSerializer, +) class ContactFormModelView(generics.CreateAPIView): @@ -9,6 +13,11 @@ class ContactFormModelView(generics.CreateAPIView): serializer_class = FeedbackSerializer +class FeedbackFormDropdownListView(generics.ListAPIView): + queryset = FeedbackFormDropdown.objects.all() + serializer_class = FeedbackFormDropdownSerializer + + class ContentCurationRequestView(generics.CreateAPIView): queryset = ContentCurationRequest.objects.all() serializer_class = ContentCurationRequestSerializer diff --git a/functional_tests/check_collection.py b/functional_tests/test_check_collection.py similarity index 100% rename from 
functional_tests/check_collection.py rename to functional_tests/test_check_collection.py diff --git a/init.sh b/init.sh new file mode 100644 index 00000000..c6d357a4 --- /dev/null +++ b/init.sh @@ -0,0 +1,42 @@ +#!/bin/bash +echo "Running all test cases across the project with coverage analysis..." + +# Initialize a failure counter +failure_count=0 + +# Exclude tests in `document_classifier` and `functional_tests` directories +excluded_dirs="document_classifier functional_tests" + +# Find all test files except those in excluded directories +test_files=$(find . -type f -name "test_*.py" | grep -Ev "$(echo $excluded_dirs | sed 's/ /|/g')") + +coverage erase # Clear any existing coverage data + +# Setup .coveragerc configuration to include all Python files +echo "[run] +source = . +include = */*.py + +[report] +show_missing = True" > .coveragerc + +# Run each test file with coverage (without generating report yet) +for test_file in $test_files; do + echo "Running $test_file..." + coverage run --append -m pytest "$test_file" # Collect coverage data + + # Check the exit status of pytest + if [ $? -ne 0 ]; then + echo "Test failed: $test_file" + failure_count=$((failure_count + 1)) + fi +done + +# Report the results without generating the coverage report +if [ $failure_count -ne 0 ]; then + echo "$failure_count test(s) failed. Refer to the terminal output for details." + exit 1 +else + echo "All tests passed successfully!" + echo "Coverage data collected. Coverage report will be generated separately." +fi diff --git a/production.yml b/production.yml index 8e5853e1..cf9a5244 100644 --- a/production.yml +++ b/production.yml @@ -10,7 +10,7 @@ services: dockerfile: ./compose/production/django/Dockerfile image: sde_indexing_helper_production_django volumes: - - ./backups:/app/backups + - ./backups:/app/backups:z depends_on: - postgres env_file: @@ -38,6 +38,10 @@ services: - django volumes: - production_traefik:/etc/traefik/acme + env_file: + # this should contain TRAEFIK_DOMAIN=sde-indexing-helper-staging.nasa-impact.net or + # TRAEFIK_DOMAIN=sde-indexing-helper.nasa-impact.net or + - ./.envs/.production/.traefik ports: - "0.0.0.0:80:80" - "0.0.0.0:443:443" diff --git a/requirements/base.txt b/requirements/base.txt index ee7e5f70..b5882ced 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -25,8 +25,11 @@ django-cors-headers==4.4.0 django-filter==24.3 djangorestframework-datatables==0.7.2 djangorestframework==3.15.2 +factory-boy==3.3.0 lxml==4.9.2 PyGithub==2.2.0 +pytest-django==4.8.0 +pytest==8.0.0 tqdm==4.66.3 unidecode==1.3.8 xmltodict==0.13.0 diff --git a/requirements/local.txt b/requirements/local.txt index ef637bca..6db73be1 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -13,6 +13,10 @@ pytest==8.0.0 # https://github.com/pytest-dev/pytest pytest-sugar==1.0.0 # https://github.com/Frozenball/pytest-sugar types-requests # maybe instead, we should add `mypy --install-types` to the dockerfile? 
types-xmltodict +pytest-xdist>=3.3.1 +pytest-cov>=4.1.0 +selenium>=4.15.2 # Selenium (Frontend Testing) +coverage==7.4.1 # Documentation # ------------------------------------------------------------------------------ diff --git a/scraper/sinequa_webcrawler_base_template.xml b/scraper/sinequa_webcrawler_base_template.xml index 6db8d54c..c05c93d2 100644 --- a/scraper/sinequa_webcrawler_base_template.xml +++ b/scraper/sinequa_webcrawler_base_template.xml @@ -227,7 +227,7 @@ - 8 + 3 diff --git a/scraper/url_grouper.py b/scraper/url_grouper.py index df3be3f9..01b12f8b 100644 --- a/scraper/url_grouper.py +++ b/scraper/url_grouper.py @@ -42,10 +42,12 @@ output_file.write(f"

{BASE_URL}

\n") output_file.write("
    \n") for key, value in my_dict.items(): - output_file.write(f'
  • {key}\n') + output_file.write(f'
  • {key}\n') # noqa: E231 output_file.write("
      \n") for item in value: - output_file.write(f'
    • {item}
    • \n') + output_file.write( + f'
    • {item}
    • \n' # noqa: E231 + ) output_file.write("
    \n") output_file.write("
  • \n") output_file.write("
\n") diff --git a/scripts/analyze_and_remove_url_duplicates.py b/scripts/analyze_and_remove_url_duplicates.py new file mode 100644 index 00000000..61693a9f --- /dev/null +++ b/scripts/analyze_and_remove_url_duplicates.py @@ -0,0 +1,76 @@ +from collections import defaultdict + +from django.db import models + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection + +# Get all field names except 'id' and 'collection' (since we're already looping by collection) +duplicate_fields = [field.name for field in CandidateURL._meta.get_fields() if field.name not in ["id", "collection"]] + + +def analyze_duplicates(): + """Analyze duplicates and print how many would be deleted in each collection.""" + deletion_stats = defaultdict(lambda: {"total": 0, "to_delete": 0}) + + # Loop through each collection + for collection in Collection.objects.all(): + # Count total URLs for the collection + total_urls = CandidateURL.objects.filter(collection=collection).count() + deletion_stats[collection.config_folder]["total"] = total_urls + + # Group CandidateURL instances by all fields dynamically + duplicates_in_collection = ( + CandidateURL.objects.filter(collection=collection) + .values(*duplicate_fields) + .annotate(count=models.Count("id")) + .filter(count__gt=1) + ) + + # Count potential deletions without deleting + for entry in duplicates_in_collection: + duplicates_count = CandidateURL.objects.filter( + collection=collection, **{field: entry[field] for field in duplicate_fields} + ).count() + deletion_stats[collection.config_folder]["to_delete"] += duplicates_count - 1 + + # Print analysis results + print("Duplicate analysis completed.") + for config_folder, stats in deletion_stats.items(): + print(f"{config_folder}' has {stats['total']} total URL(s), with {stats['to_delete']} duplicates.") + + +def delete_duplicates(): + """Delete duplicates based on previously analyzed duplicates.""" + deletion_stats = defaultdict(int) + + # Loop through each collection + for collection in Collection.objects.all(): + # Group CandidateURL instances by all fields dynamically + duplicates_in_collection = ( + CandidateURL.objects.filter(collection=collection) + .values(*duplicate_fields) + .annotate(count=models.Count("id")) + .filter(count__gt=1) + ) + + # Delete duplicates and track deletions + for entry in duplicates_in_collection: + duplicates = CandidateURL.objects.filter( + collection=collection, **{field: entry[field] for field in duplicate_fields} + ) + + # Keep the first instance and delete the rest + for candidate in duplicates[1:]: # Skip the first to retain it + candidate.delete() + deletion_stats[collection.config_folder] += 1 + + # Print deletion results + print("Duplicate URL cleanup completed.") + for config_folder, deleted_count in deletion_stats.items(): + print(f"Collection '{config_folder}' had {deleted_count} duplicate URL(s) deleted.") + + +# Usage +analyze_duplicates() # First analyze duplicates +delete_duplicates() # Then delete duplicates based on analysis diff --git a/scripts/delete_duplicate_urls_on_webapp.py b/scripts/delete_duplicate_urls_on_webapp.py index 0cc561f5..b3027ad5 100644 --- a/scripts/delete_duplicate_urls_on_webapp.py +++ b/scripts/delete_duplicate_urls_on_webapp.py @@ -1,35 +1,109 @@ -from django.db.models import Count +import time + +from django.db.models import Count, Min from sde_collections.models.candidate_url import CandidateURL from sde_collections.models.collection import Collection +from 
sde_collections.models.collection_choice_fields import WorkflowStatusChoices + + +def is_priority_collection(collection): + priority_statuses = { + WorkflowStatusChoices.CURATED, + WorkflowStatusChoices.QUALITY_FIXED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED, + WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, + WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, + WorkflowStatusChoices.QUALITY_CHECK_FAILED, + WorkflowStatusChoices.QUALITY_CHECK_MINOR, + WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + WorkflowStatusChoices.PROD_PERFECT, + WorkflowStatusChoices.PROD_MINOR, + WorkflowStatusChoices.PROD_MAJOR, + } + return collection.workflow_status in priority_statuses + +def deduplicate_candidate_urls(): + start_time = time.time() -def remove_duplicate_urls(collection_name): - """ - Removes duplicate CandidateURL entries for a given collection name. - - Args: - - collection_name: The name of the collection for which to remove duplicate URLs. - """ - try: - collection = Collection.objects.get(name=collection_name) - except Collection.DoesNotExist: - print(f"Collection with name '{collection_name}' does not exist.") - return - - duplicate_urls = ( - CandidateURL.objects.filter(collection=collection) - .values("url") - .annotate(url_count=Count("id")) - .filter(url_count__gt=1) + # Keep the existing collection preprocessing + collection_counts = { + c["id"]: c["url_count"] + for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count") + } + collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()} + + # Phase 1: Intra-collection duplicates (keep this part the same) + intra_dupes = ( + CandidateURL.objects.values("collection_id", "url") + .annotate(count=Count("id"), min_id=Min("id")) + .filter(count__gt=1) ) - for entry in duplicate_urls: - duplicate_entries = CandidateURL.objects.filter(collection=collection, url=entry["url"]).order_by("id") + intra_ids_to_delete = [] + for dupe in intra_dupes: + dupe_ids = set( + CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"]) + .exclude(id=dupe["min_id"]) + .values_list("id", flat=True) + ) + intra_ids_to_delete.extend(dupe_ids) + + CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete() + + # Phase 2: Modified Cross-collection duplicates + cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1) + + cross_ids_to_delete = [] + for dupe in cross_dupes: + # Get all instances of this URL with their relevant data + instances = list(CandidateURL.objects.filter(url=dupe["url"]).order_by("id").values("id", "collection_id")) - duplicates_to_delete = duplicate_entries.exclude(id=duplicate_entries.first().id) - count_deleted = duplicates_to_delete.count() - duplicates_to_delete.delete() - print(f"Deleted {count_deleted} duplicate entries for URL '{entry['url']}'.") + while len(instances) > 1: # Process until we only have one instance left + # Create comparison data for each instance + instance_data = [ + { + "id": inst["id"], + "collection_id": inst["collection_id"], + "is_priority": collection_status[inst["collection_id"]], + "url_count": collection_counts[inst["collection_id"]], + } + for inst in instances + ] - print("Completed deleting duplicated URLs...") + # Find the instance to keep based on the new rules + def get_instance_to_delete(instances_list): + # First, separate by priority + priority_instances = [i for i in instances_list if i["is_priority"]] + 
non_priority_instances = [i for i in instances_list if not i["is_priority"]] + + # If we have both priority and non-priority, delete from non-priority + if priority_instances and non_priority_instances: + return non_priority_instances[0] + + # If all instances are of same priority type, compare url counts + working_list = priority_instances if priority_instances else non_priority_instances + min_count = min(i["url_count"] for i in working_list) + lowest_count_instances = [i for i in working_list if i["url_count"] == min_count] + + # If multiple instances have the same count, take the one with lowest ID + return min(lowest_count_instances, key=lambda x: x["id"]) + + # Get the instance to delete + instance_to_delete = get_instance_to_delete(instance_data) + + # Add it to our delete list and remove from instances + cross_ids_to_delete.append(instance_to_delete["id"]) + instances = [inst for inst in instances if inst["id"] != instance_to_delete["id"]] + + CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete() + + elapsed_time = time.time() - start_time + action = "Deleted" + print( + f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} " + f"cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) " + f"in {elapsed_time:.2f} seconds" + ) diff --git a/scripts/ej/README.md b/scripts/ej/README.md new file mode 100644 index 00000000..eb74c490 --- /dev/null +++ b/scripts/ej/README.md @@ -0,0 +1,80 @@ +# EJ Data Processing Pipeline + +This pipeline processes NASA Common Metadata Repository (CMR) data and environmental justice (EJ) classifications to create standardized data dumps for the Science Discovery Engine (SDE). + +## Overview + +The pipeline consists of several components: +- CMR data processing +- Environmental justice classification processing +- Threshold-based filtering +- Data dump creation + +## Setup + +1. Clone the repository +2. Install dependencies +3. Configure settings in `scripts/ej/config.py` + +## Input Files + +You need two main input files: + +1. **CMR Collections Data**: Generated using: +```bash +github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py +``` + +2. **Classification Predictions**: Provided by the classification model, contact Bishwas for access + +## Configuration + +Edit `scripts/ej/config.py` to customize: + +- Classification thresholds +- Authorized classifications +- Input/output filenames +- Timestamp formats + +Example configuration: +```python +# Adjust thresholds for different indicators +INDICATOR_THRESHOLDS = { + "Climate Change": 1.0, + "Disasters": 0.80, + # ... 
other thresholds +} + +# Change filenames +CMR_FILENAME = "your_cmr_file.json" +INFERENCE_FILENAME = "your_predictions.json" +``` + +## Usage + +### Basic Usage + +Run the pipeline on a local machine with the input files: +```bash +python create_ej_dump.py +``` + +## Output + +The pipeline generates a JSON file named `ej_dump_YYYYMMDD_HHMMSS.json` containing: +- Processed CMR metadata +- Environmental justice classifications + +## Server Deployment + +To deploy the output to the server: +```bash +# Copy to server +scp ej_dump_YYYYMMDD_HHMMSS.json sde:/home/ec2-user/sde_indexing_helper/backups/ + +# Process on server using dm shell +dmshell + +# add your file name to cmr_to_models.py +# paste and run the contents within the shell +``` diff --git a/scripts/ej/cmr_example.json b/scripts/ej/cmr_example.json new file mode 100644 index 00000000..dfc92fac --- /dev/null +++ b/scripts/ej/cmr_example.json @@ -0,0 +1,692 @@ +[ + { + "meta": { + "revision-id": 41, + "deleted": false, + "format": "application/vnd.nasa.cmr.umm+json", + "provider-id": "SEDAC", + "has-combine": false, + "user-id": "mhansen", + "has-formats": false, + "has-spatial-subsetting": false, + "native-id": "2000 Pilot Environmental Sustainability Index (ESI)", + "has-transforms": false, + "has-variables": false, + "concept-id": "C179001887-SEDAC", + "revision-date": "2022-12-05T20:48:32.236Z", + "has-temporal-subsetting": false, + "concept-type": "collection" + }, + "umm": { + "DataLanguage": "English", + "CollectionCitations": [ + { + "Version": "2000.00", + "Title": "2000 Pilot Environmental Sustainability Index (ESI)", + "Creator": "World Economic Forum - WEF - Global Leaders for Tomorrow Environment Task Force, Yale Center for Environmental Law and Policy - YCELP - Yale University, and Center for International Earth Science Information Network - CIESIN - Columbia University", + "ReleaseDate": "2000-12-31T00:00:00.000Z", + "ReleasePlace": "New Haven, CT", + "Publisher": "Yale Center for Environmental Law and Policy (YCELP)/Yale University", + "OnlineResource": { + "Linkage": "https://doi.org/10.7927/H4NK3BZJ" + } + } + ], + "SpatialExtent": { + "SpatialCoverageType": "HORIZONTAL", + "HorizontalSpatialDomain": { + "Geometry": { + "CoordinateSystem": "CARTESIAN", + "BoundingRectangles": [ + { + "NorthBoundingCoordinate": 90.0, + "WestBoundingCoordinate": -180.0, + "EastBoundingCoordinate": 180.0, + "SouthBoundingCoordinate": -55.0 + } + ] + } + }, + "GranuleSpatialRepresentation": "CARTESIAN" + }, + "CollectionProgress": "COMPLETE", + "ScienceKeywords": [ + { + "Category": "EARTH SCIENCE", + "Topic": "HUMAN DIMENSIONS", + "Term": "SUSTAINABILITY", + "VariableLevel1": "ENVIRONMENTAL SUSTAINABILITY" + } + ], + "TemporalExtents": [ + { + "EndsAtPresentFlag": false, + "RangeDateTimes": [ + { + "BeginningDateTime": "1978-01-01T00:00:00.000Z", + "EndingDateTime": "1999-12-31T00:00:00.000Z" + } + ] + } + ], + "ProcessingLevel": { + "Id": "4" + }, + "DOI": { + "DOI": "10.7927/H4NK3BZJ", + "Authority": "https://doi.org/" + }, + "ShortName": "CIESIN_SEDAC_ESI_2000", + "EntryTitle": "2000 Pilot Environmental Sustainability Index (ESI)", + "PublicationReferences": [ + { + "Title": "2001 Environmental Sustainability Index (ESI)", + "Publisher": "Yale Center for Environmental Law and Policy (YCELP)/Yale University", + "DOI": { + "DOI": "10.7927/H4X34VDM", + "Authority": "https://doi.org/" + }, + "Author": "World Economic Forum - WEF - Global Leaders for Tomorrow Environment Task Force, Yale Center for Environmental Law and Policy - YCELP - 
Yale University, and Center for International Earth Science Information Network - CIESIN - Columbia University", + "PublicationDate": "2001-12-31T00:00:00.000Z", + "Edition": "2001.00", + "PublicationPlace": "New Haven, CT" + }, + { + "Title": "2002 Environmental Sustainability Index (ESI)", + "Publisher": "Yale Center for Environmental Law and Policy (YCELP)/Yale University", + "DOI": { + "DOI": "10.7927/H4SB43P8", + "Authority": "https://doi.org/" + }, + "Author": "World Economic Forum - WEF - Global Leaders for Tomorrow Environment Task Force, Yale Center for Environmental Law and Policy - YCELP - Yale University, and Center for International Earth Science Information Network - CIESIN - Columbia University", + "PublicationDate": "2002-12-31T00:00:00.000Z", + "Edition": "2002.00", + "PublicationPlace": "New Haven, CT" + }, + { + "Title": "2005 Environmental Sustainability Index (ESI)", + "Publisher": "Yale Center for Environmental Law and Policy (YCELP)/Yale University", + "DOI": { + "DOI": "10.7927/H40V89R6", + "Authority": "https://doi.org/" + }, + "Author": "Yale Center for Environmental Law and Policy - YCELP - Yale University, Center for International Earth Science Information Network - CIESIN - Columbia University, World Economic Forum - WEF, and Joint Research Centre - JRC - European Commission", + "PublicationDate": "2005-12-31T00:00:00.000Z", + "Edition": "2005.00", + "PublicationPlace": "New Haven, CT" + } + ], + "AccessConstraints": { + "Description": "None" + }, + "RelatedUrls": [ + { + "Description": "Sample browse graphic of the data set.", + "URLContentType": "VisualizationURL", + "Type": "GET RELATED VISUALIZATION", + "URL": "https://sedac.ciesin.columbia.edu/downloads/maps/esi/esi-pilot-environmental-sustainability-index-2000/sedac-logo.jpg" + }, + { + "Description": "Data Download Page", + "URLContentType": "DistributionURL", + "Type": "GET DATA", + "Subtype": "DIRECT DOWNLOAD", + "URL": "https://sedac.ciesin.columbia.edu/data/set/esi-pilot-environmental-sustainability-index-2000/data-download" + }, + { + "Description": "Data Set\u00a0Overview Page", + "URLContentType": "PublicationURL", + "Type": "VIEW RELATED INFORMATION", + "Subtype": "GENERAL DOCUMENTATION", + "URL": "https://sedac.ciesin.columbia.edu/data/set/esi-pilot-environmental-sustainability-index-2000" + } + ], + "ContactGroups": [ + { + "Roles": [ + "Metadata Author" + ], + "GroupName": "CIESIN METADATA ADMINISTRATION", + "ContactInformation": { + "ContactMechanisms": [ + { + "Type": "Email", + "Value": "metadata@ciesin.columbia.edu" + }, + { + "Type": "Fax", + "Value": "+1 845-365-8922" + }, + { + "Type": "Telephone", + "Value": "+1 845-365-8988" + } + ], + "Addresses": [ + { + "Country": "United States", + "StreetAddresses": [ + "CIESIN, Columbia University, 61 Route 9W, P.O. 
Box 1000" + ], + "City": "Palisades", + "StateProvince": "New York", + "PostalCode": "10964" + } + ], + "RelatedUrls": [ + { + "Description": "Metadata Author\u00a0Home\u00a0Page", + "URLContentType": "DataContactURL", + "Type": "HOME PAGE", + "URL": "https://sedac.ciesin.columbia.edu" + } + ] + } + } + ], + "DataDates": [ + { + "Date": "2000-12-31T00:00:00.000Z", + "Type": "CREATE" + }, + { + "Date": "2000-12-31T00:00:00.000Z", + "Type": "UPDATE" + } + ], + "Abstract": "The 2000 Pilot Environmental Sustainability Index (ESI) is an exploratory effort to construct an index that measures the ability of a nation's economy to achieve sustainable development, with the long term goal of finding a single indicator for environmental sustainability analagous to that of the Gross Domestic Product (GDP). The index covering 56 countries is a composite measure of the current status of a nation's environmental systems, pressures on those systems, human vulnerability to environmental change, national capacity to respond, and contributions to global environmental stewardship. The index was unveiled at the World Economic Forum's annual meeting, January 2000, Davos, Switzerland. The 2000 Pilot ESI is the result of collaboration among the World Economic Forum (WEF), Yale Center for Environmental Law and Policy (YCELP), and the Columbia University Center for International Earth Science Information Network (CIESIN).", + "Purpose": "To test the feasibility of creating a comparative index of national-level environmental sustainability.", + "LocationKeywords": [ + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "WESTERN ASIA", + "Subregion2": "MIDDLE EAST", + "Subregion3": "BAHRAIN" + }, + { + "Category": "CONTINENT", + "Type": "NORTH AMERICA", + "Subregion1": "CANADA" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "ARGENTINA" + }, + { + "Category": "CONTINENT", + "Type": "AUSTRALIA/NEW ZEALAND", + "Subregion1": "AUSTRALIA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "AUSTRIA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "BELGIUM" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "BOLIVIA" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "BRAZIL" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "BULGARIA" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "CHILE" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "COLOMBIA" + }, + { + "Category": "CONTINENT", + "Type": "NORTH AMERICA", + "Subregion1": "CENTRAL AMERICA", + "Subregion2": "COSTA RICA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "SCANDINAVIA", + "Subregion3": "DENMARK" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "ECUADOR" + }, + { + "Category": "CONTINENT", + "Type": "AFRICA", + "Subregion1": "NORTHERN AFRICA", + "Subregion2": "EGYPT" + }, + { + "Category": "CONTINENT", + "Type": "NORTH AMERICA", + "Subregion1": "CENTRAL AMERICA", + "Subregion2": "EL SALVADOR" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "SCANDINAVIA", + "Subregion3": "FINLAND" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "FRANCE" 
+ }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "GERMANY" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "SOUTHERN EUROPE", + "Subregion2": "GREECE" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "HUNGARY" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "ICELAND" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "SOUTHCENTRAL ASIA", + "Subregion2": "INDIA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "BRITISH ISLES", + "Subregion3": "IRELAND" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "WESTERN ASIA", + "Subregion2": "MIDDLE EAST", + "Subregion3": "ISRAEL" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "SOUTHERN EUROPE", + "Subregion2": "ITALY" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "EASTERN ASIA", + "Subregion2": "JAPAN" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "WESTERN ASIA", + "Subregion2": "MIDDLE EAST", + "Subregion3": "JORDAN" + }, + { + "Category": "OCEAN", + "Type": "INDIAN OCEAN", + "Subregion1": "MALAYSIA" + }, + { + "Category": "OCEAN", + "Type": "INDIAN OCEAN", + "Subregion1": "MAURITIUS" + }, + { + "Category": "CONTINENT", + "Type": "NORTH AMERICA", + "Subregion1": "MEXICO" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "NETHERLANDS" + }, + { + "Category": "CONTINENT", + "Type": "AUSTRALIA/NEW ZEALAND", + "Subregion1": "NEW ZEALAND" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "SCANDINAVIA", + "Subregion3": "NORWAY" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "PERU" + }, + { + "Category": "OCEAN", + "Type": "PACIFIC OCEAN", + "Subregion1": "WESTERN PACIFIC OCEAN", + "Subregion2": "PHILIPPINES" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "POLAND" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "PORTUGAL" + }, + { + "Category": "OCEAN", + "Type": "INDIAN OCEAN", + "Subregion1": "SINGAPORE" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "SLOVAKIA" + }, + { + "Category": "CONTINENT", + "Type": "AFRICA", + "Subregion1": "SOUTHERN AFRICA", + "Subregion2": "SOUTH AFRICA" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "EASTERN ASIA", + "Subregion2": "SOUTH KOREA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "SOUTHERN EUROPE", + "Subregion2": "SPAIN", + "Subregion3": "GIBRALTAR" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "SCANDINAVIA", + "Subregion3": "SWEDEN" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "WESTERN EUROPE", + "Subregion2": "SWITZERLAND" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "SOUTHEASTERN ASIA", + "Subregion2": "THAILAND" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "WESTERN ASIA", + "Subregion2": "TURKEY" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "UKRAINE" + }, + { + "Category": "CONTINENT", + 
"Type": "EUROPE", + "Subregion1": "NORTHERN EUROPE", + "Subregion2": "BRITISH ISLES", + "Subregion3": "UNITED KINGDOM" + }, + { + "Category": "CONTINENT", + "Type": "SOUTH AMERICA", + "Subregion1": "VENEZUELA" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "SOUTHEASTERN ASIA", + "Subregion2": "VIETNAM" + }, + { + "Category": "CONTINENT", + "Type": "AFRICA", + "Subregion1": "EASTERN AFRICA", + "Subregion2": "ZIMBABWE" + }, + { + "Category": "GEOGRAPHIC REGION", + "Type": "GLOBAL" + }, + { + "Category": "CONTINENT", + "Type": "NORTH AMERICA", + "Subregion1": "UNITED STATES OF AMERICA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "SOUTHERN EUROPE", + "Subregion2": "SPAIN" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "RUSSIAN FEDERATION" + }, + { + "Category": "OCEAN", + "Type": "INDIAN OCEAN", + "Subregion1": "INDONESIA" + }, + { + "Category": "CONTINENT", + "Type": "ASIA", + "Subregion1": "EASTERN ASIA", + "Subregion2": "CHINA" + }, + { + "Category": "CONTINENT", + "Type": "EUROPE", + "Subregion1": "EASTERN EUROPE", + "Subregion2": "CZECHIA" + } + ], + "MetadataDates": [ + { + "Type": "CREATE", + "Date": "2012-12-12T00:00:00.000Z" + }, + { + "Type": "UPDATE", + "Date": "2022-12-05T20:40:00.000Z" + } + ], + "DirectoryNames": [ + { + "ShortName": "CIESIN_SEDAC_ESI_2000" + } + ], + "Version": "2000.00", + "Projects": [ + { + "ShortName": "ESI", + "LongName": "Environmental Sustainability Index" + } + ], + "UseConstraints": { + "Description": "Users are free to use, copy, distribute, transmit, and adapt the work for commercial and non-commercial purposes, without restriction, as long as clear attribution of the source is provided." + }, + "CollectionDataType": "SCIENCE_QUALITY", + "DataCenters": [ + { + "Roles": [ + "DISTRIBUTOR" + ], + "ShortName": "SEDAC", + "LongName": "Socioeconomic Data and Applications Center", + "ContactInformation": { + "ContactMechanisms": [ + { + "Type": "Telephone", + "Value": "+1 845-365-8920" + }, + { + "Type": "Fax", + "Value": "+1 845-365-8922" + }, + { + "Type": "Email", + "Value": "ciesin.info@ciesin.columbia.edu" + } + ], + "Addresses": [ + { + "Country": "United States", + "StreetAddresses": [ + "CIESIN, Columbia University, 61 Route 9W, P.O. Box 1000" + ], + "City": "Palisades", + "StateProvince": "New York", + "PostalCode": "10964" + } + ], + "RelatedUrls": [ + { + "Description": "Distributor Home Page", + "URLContentType": "DataCenterURL", + "Type": "HOME PAGE", + "URL": "https://sedac.ciesin.columbia.edu/" + } + ] + } + }, + { + "Roles": [ + "ARCHIVER" + ], + "ShortName": "SEDAC", + "LongName": "Socioeconomic Data and Applications Center", + "ContactInformation": { + "ContactMechanisms": [ + { + "Type": "Telephone", + "Value": "+1 845-365-8920" + }, + { + "Type": "Fax", + "Value": "+1 845-365-8922" + }, + { + "Type": "Email", + "Value": "ciesin.info@ciesin.columbia.edu" + } + ], + "Addresses": [ + { + "Country": "United States", + "StreetAddresses": [ + "CIESIN, Columbia University, 61 Route 9W, P.O. 
Box 1000" + ], + "City": "Palisades", + "StateProvince": "New York", + "PostalCode": "10964" + } + ], + "RelatedUrls": [ + { + "Description": "Archiver Home Page", + "URLContentType": "DataCenterURL", + "Type": "HOME PAGE", + "URL": "https://sedac.ciesin.columbia.edu/" + } + ] + } + } + ], + "Platforms": [ + { + "Type": "Models", + "ShortName": "MODELS", + "LongName": "MODELS", + "Instruments": [ + { + "ShortName": "Computer", + "LongName": "Computer" + } + ] + } + ], + "MetadataSpecification": { + "URL": "https://cdn.earthdata.nasa.gov/umm/collection/v1.18.1", + "Name": "UMM-C", + "Version": "1.18.1" + }, + "ArchiveAndDistributionInformation": { + "FileArchiveInformation": [ + { + "Format": "PDF" + } + ], + "FileDistributionInformation": [ + { + "Format": "PDF", + "Fees": "0" + } + ] + } + }, + "indicators": "Human Dimensions" + } +] diff --git a/scripts/ej/cmr_processing.py b/scripts/ej/cmr_processing.py new file mode 100644 index 00000000..6b6c77ab --- /dev/null +++ b/scripts/ej/cmr_processing.py @@ -0,0 +1,428 @@ +import urllib.parse +from datetime import datetime +from typing import NamedTuple + + +class TemporalInfo(NamedTuple): + """Container for processed temporal information.""" + + latest_end_date: datetime | None + total_duration: int + resolution: str + resolution_unit: str + single_date_times: list[str] + + +class SpatialInfo(NamedTuple): + """Container for processed spatial information.""" + + is_global: bool + resolution: str + bounding_rectangles: list[dict] + + +class DownloadInfo(NamedTuple): + """Container for processed download information.""" + + has_distribution: bool + has_direct_download: bool + visualization_urls: list[str] + formats: list[str] # Changed from single format to list of formats + + +class ProcessingInfo(NamedTuple): + """Container for processing level information.""" + + level: str + collection_type: str + data_centers: list[str] + + +class CmrDataset: + """Comprehensive processor for CMR dataset information.""" + + def __init__(self, dataset: dict): + self.dataset = dataset + self.meta = dataset.get("meta", {}) + self.umm = dataset.get("umm", {}) + self.today = datetime.now() + + # Process all information once during initialization + self.temporal_info = self._process_temporal_extents() + self.spatial_info = self._process_spatial_info() + self.download_info = self._process_download_info() + self.processing_info = self._process_processing_info() + + @staticmethod + def _parse_datetime(date_str: str) -> datetime: + """Parse CMR datetime string to datetime object.""" + try: + return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ") + except ValueError: + # Some dates might not have milliseconds + return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ") + + def _check_temporal_range(self, range_datetime: dict) -> tuple[datetime, datetime]: + """Extract begin and end dates from a range datetime entry.""" + begin_date = self._parse_datetime(range_datetime["BeginningDateTime"]) + end_date = self._parse_datetime(range_datetime["EndingDateTime"]) + return begin_date, end_date + + def _process_temporal_extents(self) -> TemporalInfo: + temporal_extents = self.umm.get("TemporalExtents", []) + latest_end_date = None + total_duration = 0 + all_temporal_strings = [] + + for extent in temporal_extents: + # Process single dates + all_temporal_strings.extend(extent.get("SingleDateTimes", [])) + + # Process range dates + for range_dt in extent.get("RangeDateTimes", []): + try: + begin_date, end_date = self._check_temporal_range(range_dt) + range_str = 
f"{range_dt['BeginningDateTime']} - {range_dt['EndingDateTime']}" + all_temporal_strings.append(range_str) + + if latest_end_date is None or end_date > latest_end_date: + latest_end_date = end_date + total_duration += (end_date - begin_date).days + except (KeyError, ValueError): + continue + + temporal_resolution_dict = temporal_extents[0].get("TemporalResolution", {}) if temporal_extents else {} + resolution_value = temporal_resolution_dict.get("Value", "") + resolution_unit = temporal_resolution_dict.get("Unit", "") + + return TemporalInfo( + latest_end_date=latest_end_date, + total_duration=total_duration, + resolution=str(resolution_value), + resolution_unit=resolution_unit, + single_date_times=sorted(all_temporal_strings), + ) + + def _process_spatial_info(self) -> SpatialInfo: + """Process all spatial information.""" + spatial_extent = self.umm.get("SpatialExtent", {}) + horizontal_domain = spatial_extent.get("HorizontalSpatialDomain", {}) + geometry = horizontal_domain.get("Geometry", {}) + rectangles = geometry.get("BoundingRectangles", []) + + is_global = any( + abs(rect.get("NorthBoundingCoordinate", 0)) >= 85 + and abs(rect.get("SouthBoundingCoordinate", 0)) >= 85 + and abs(rect.get("WestBoundingCoordinate", 0)) >= 175 + and abs(rect.get("EastBoundingCoordinate", 0)) >= 175 + for rect in rectangles + ) + + resolution = self._extract_spatial_resolution(horizontal_domain) + + return SpatialInfo(is_global, resolution, rectangles) + + def _extract_spatial_resolution(self, horizontal_domain: dict) -> str: + """ + Extract and format spatial resolution from horizontal domain data. + + Args: + horizontal_domain: Dictionary containing resolution information + + Returns: + Formatted resolution string or empty string if not available + """ + resolution_system = horizontal_domain.get("ResolutionAndCoordinateSystem", {}) + resolution_data = resolution_system.get("HorizontalDataResolution", {}) + + if not resolution_data: + return "" + + # Check for Varies resolution + if resolution_data.get("VariesResolution") == "Varies": + return "Varies" + + # Check for GriddedRangeResolutions (use maximum values) + gridded_range = resolution_data.get("GriddedRangeResolutions", []) + if gridded_range: + # I spot checked 200 datasets, and never saw more than one entry + # so I'm just going to use the first one for now for simplicity + range_data = gridded_range[0] + # in a gridded range, MinimumXDimension is also available, + # however I have chosen to use the less impressive MaximumXDimension + max_x = range_data.get("MaximumXDimension") + max_y = range_data.get("MaximumYDimension") + unit = range_data.get("Unit", "").lower() + if max_x and max_y and unit: + # Use the larger of the two dimensions + max_dim = max(max_x, max_y) + return f"{max_dim} {unit}" + return "" + + # Check for GriddedResolutions + gridded = resolution_data.get("GriddedResolutions", []) + if gridded: + grid_data = gridded[0] + x_dim = grid_data.get("XDimension") + y_dim = grid_data.get("YDimension") + unit = grid_data.get("Unit", "").lower() + if x_dim and y_dim and unit: + # If dimensions differ, use the larger one + max_dim = max(x_dim, y_dim) + return f"{max_dim} {unit}" + return "" + + # Check for GenericResolutions + generic = resolution_data.get("GenericResolutions", []) + if generic: + generic_data = generic[0] + x_dim = generic_data.get("XDimension") + y_dim = generic_data.get("YDimension") + unit = generic_data.get("Unit", "").lower() + if x_dim and y_dim and unit: + # If dimensions differ, use the larger one + max_dim = 
max(x_dim, y_dim) + return f"{max_dim} {unit}" + return "" + + return "" + + def _process_download_info(self) -> DownloadInfo: + """Process all download and visualization information.""" + has_distribution = False + has_direct_download = False + visualization_urls = [] + formats = [] + + # Extract formats from FileDistributionInformation + archive_info = self.umm.get("ArchiveAndDistributionInformation", {}) + distribution_info = archive_info.get("FileDistributionInformation", []) + + for info in distribution_info: + if "Format" in info: + formats.append(info["Format"]) + + # Process RelatedUrls + related_urls = self.umm.get("RelatedUrls", []) + for url in related_urls: + if url.get("URLContentType") == "DistributionURL" and url.get("Type") == "GET DATA": + has_distribution = True + if url.get("Subtype") == "DIRECT DOWNLOAD": + has_direct_download = True + elif url.get("URLContentType") == "VisualizationURL": + visualization_urls.append(url.get("URL", "")) + + return DownloadInfo( + has_distribution=has_distribution, + has_direct_download=has_direct_download, + visualization_urls=visualization_urls, + formats=formats, + ) + + @property + def format(self) -> str: + """Get dataset formats as semicolon-separated string.""" + return "; ".join(self.download_info.formats) if self.download_info.formats else "" + + def _process_processing_info(self) -> ProcessingInfo: + """Process all processing level information.""" + processing_level = self.umm.get("ProcessingLevel", {}).get("Id", "") + collection_type = self.umm.get("CollectionDataType", "") + # Get all data center short names + data_centers = [ + center.get("ShortName", "") for center in self.umm.get("DataCenters", []) if center.get("ShortName") + ] + + return ProcessingInfo(processing_level, collection_type, data_centers) + + def get_properties(self) -> tuple[str, str]: + """ + Get dataset strengths and weaknesses together. + Returns tuple of (strengths_string, weaknesses_string). 
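+
+        Strength checks (see below): an ACTIVE collection, NEAR_REAL_TIME data,
+        an end date within the last three years, five or more years of total
+        temporal coverage, and a direct download link. Weakness checks: no data
+        in the last three years, under one year of coverage, or a distribution
+        URL with no direct download.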
+ """ + strengths = set() + weaknesses = set() + + # Collection activity + if self.umm.get("CollectionProgress") == "ACTIVE": + strengths.add("Data collection is ongoing") + + # Data type + if self.processing_info.collection_type == "NEAR_REAL_TIME": + strengths.add("Near real-time data is available") + + # Temporal characteristics + if self.temporal_info.latest_end_date: + age_in_days = (self.today - self.temporal_info.latest_end_date).days + if age_in_days <= (3 * 365): + strengths.add("Recent data is available") + else: + weaknesses.add("No recent data available") + + if self.temporal_info.total_duration: + if self.temporal_info.total_duration >= (5 * 365): + strengths.add("Long temporal extent") + elif self.temporal_info.total_duration < 365: + weaknesses.add("Limited temporal extent") + + # Download availability + if self.download_info.has_direct_download: + strengths.add("Direct data download available") + elif self.download_info.has_distribution: + weaknesses.add("Direct data download not available") + + return ( + "; ".join(sorted(strengths)) if strengths else "", + "; ".join(sorted(weaknesses)) if weaknesses else "", + ) + + @property + def strengths(self) -> str: + """Get dataset strengths.""" + strengths, _ = self.get_properties() + return strengths + + @property + def weaknesses(self) -> str: + """Get dataset weaknesses.""" + _, weaknesses = self.get_properties() + return weaknesses + + @property + def latency(self) -> str: + """Get dataset latency.""" + latency_mapping = { + "NEAR_REAL_TIME": "1-3 Hours", + "LOW_LATENCY": "3 Hours to 1 Day", + "EXPEDITED": "1-4 Days", + "SCIENCE_QUALITY": "Not Provided", + } + return latency_mapping.get(self.processing_info.collection_type, "Not Provided") + + @property + def intended_use(self) -> str: + """Get dataset intended use path.""" + level = self.processing_info.level + collection_type = self.processing_info.collection_type + data_centers = self.processing_info.data_centers + + if level == "4" and collection_type == "SCIENCE_QUALITY": + return "Path A" # maps to "exploration" + + if ( + (level in ["2", "2a", "2b"] and "SEDAC" in data_centers and collection_type == "SCIENCE_QUALITY") + or (level in ["3", "3a"] and collection_type == "SCIENCE_QUALITY") + or (level == "4" and collection_type != "SCIENCE_QUALITY") + ): + return "Path B" # maps to "basic analysis" + + return "Path C" # maps to "advanced analysis" + + @property + def geographic_coverage(self) -> str: + """Get dataset geographic coverage.""" + return "Global" if self.spatial_info.is_global else "" + + @property + def data_visualization(self) -> str: + """Get dataset visualization URLs.""" + return "; ".join(self.download_info.visualization_urls) + + @property + def temporal_resolution(self) -> str: + """Get dataset temporal resolution.""" + if self.temporal_info.resolution and self.temporal_info.resolution_unit: + return f"{self.temporal_info.resolution} {self.temporal_info.resolution_unit}" + return "" + + @property + def spatial_resolution(self) -> str: + """Get dataset spatial resolution.""" + return self.spatial_info.resolution + + @property + def projects(self) -> str: + """Get dataset projects with both short and long names where available.""" + projects = self.umm.get("Projects", []) + formatted_projects = [] + + for project in projects: + short_name = project.get("ShortName", "") + long_name = project.get("LongName", "") + + if short_name and long_name: + formatted_projects.append(f"{short_name} - {long_name}") + elif short_name: + 
formatted_projects.append(short_name) + elif long_name: + formatted_projects.append(long_name) + + return "; ".join(formatted_projects) + + @property + def dataset_name(self) -> str: + """Get dataset entry title or shortname.""" + return self.umm.get("EntryTitle", self.umm.get("ShortName", "")) + + @property + def description(self) -> str: + """Get dataset abstract.""" + return self.umm.get("Abstract", "") + + @property + def limitations(self) -> str: + """Get dataset access constraints.""" + return self.umm.get("AccessConstraints", {}).get("Description", "") + + @property + def temporal_extent(self) -> str: + """Get dataset temporal extent.""" + return ", ".join(self.temporal_info.single_date_times) + + @property + def source_link(self) -> str: + """Generate source link from DOI information.""" + doi_field = self.umm.get("DOI", {}) + authority = doi_field.get("Authority") + doi = doi_field.get("DOI") + if authority and doi: + return urllib.parse.urljoin(authority, doi) + return "" + + @property + def sde_link(self) -> str: + """Generate SDE link from concept ID.""" + concept_id = self.meta.get("concept-id", "") + if not concept_id: + return "" + + base_url = "https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview" + query = '{"name":"query-smd-primary","scope":"All","text":""}' + sinequa_id = f"/SDE/CMR_API/|{concept_id}" + + encoded_id = urllib.parse.quote(sinequa_id, safe="") + encoded_query = urllib.parse.quote(query, safe="") + + return f"{base_url}?id={encoded_id}&query={encoded_query}" + + def to_dict(self) -> dict: + """Convert CmrDataset to a dictionary with all final ej fields.""" + return { + "concept_id": self.meta.get("concept-id", ""), + "dataset": self.dataset_name, + "description": self.description, + "limitations": self.limitations, + "format": self.format, + "temporal_extent": self.temporal_extent, + "intended_use": self.intended_use, + "source_link": self.source_link, + "sde_link": self.sde_link, + "strengths": self.strengths, + "weaknesses": self.weaknesses, + "latency": self.latency, + "geographic_coverage": self.geographic_coverage, + "data_visualization": self.data_visualization, + "temporal_resolution": self.temporal_resolution, + "spatial_resolution": self.spatial_resolution, + "projects": self.projects, + } diff --git a/scripts/ej/cmr_to_models.py b/scripts/ej/cmr_to_models.py index 130de722..1cbb31c9 100644 --- a/scripts/ej/cmr_to_models.py +++ b/scripts/ej/cmr_to_models.py @@ -1,96 +1,50 @@ """ -the ej_dump is generated by running create_ej_dump.py and is scp'd to the COSMOS server -this script is then run via the dm shell on the COSMOS server to populate the database +Loads preprocessed EJ dump and creates database entries. + +See README.md for more information. 
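+
+Typical workflow: scp the ej_dump file into the server's backups/ directory,
+update the file path passed to process_ej_dump() at the bottom of this script,
+then paste the script into the dm shell to populate the database.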
""" import json -import urllib.parse from environmental_justice.models import EnvironmentalJusticeRow -def generate_source_link(doi_field): - authority = doi_field.get("Authority") - doi = doi_field.get("DOI") - if authority and doi: - return urllib.parse.urljoin(authority, doi) - return "" - - -def concept_id_to_sinequa_id(concept_id: str) -> str: - return f"/SDE/CMR_API/|{concept_id}" - - -def sinequa_id_to_url(sinequa_id: str) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview" - query = '{"name":"query-smd-primary","scope":"All","text":""}' - - encoded_id = urllib.parse.quote(sinequa_id, safe="") - encoded_query = urllib.parse.quote(query, safe="") - - return f"{base_url}?id={encoded_id}&query={encoded_query}" - - -def categorize_processing_level(level): - advanced_analysis_levels = {"0", "Level 0", "NA", "Not Provided", "Not provided"} - - basic_analysis_levels = { - "1", - "1A", - "1B", - "1C", - "1T", - "2", - "2A", - "2B", - "2G", - "2P", - "Level 1", - "Level 1A", - "Level 1B", - "Level 1C", - "Level 2", - "Level 2A", - "Level 2B", - } - - exploration_levels = {"3", "4", "Level 3", "Level 4", "L2"} - - if level in exploration_levels: - return "exploration" - elif level in basic_analysis_levels: - return "basic analysis" - elif level in advanced_analysis_levels: - return "advanced analysis" - else: - return "advanced analysis" - - -# remove existing data -EnvironmentalJusticeRow.objects.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV).delete() - -ej_dump = json.load(open("backups/ej_dump_20240815_112916.json")) -for dataset in ej_dump: - ej_row = EnvironmentalJusticeRow( - destination_server=EnvironmentalJusticeRow.DestinationServerChoices.DEV, - sde_link=sinequa_id_to_url(concept_id_to_sinequa_id(dataset.get("meta", {}).get("concept-id", ""))), - dataset=dataset.get("umm", {}).get("ShortName", ""), - description=dataset.get("umm", {}).get("Abstract", ""), - limitations=dataset.get("umm", {}).get("AccessConstraints", {}).get("Description", ""), - format=dataset.get("meta", {}).get("format", ""), - temporal_extent=", ".join(dataset.get("umm", {}).get("TemporalExtents", [{}])[0].get("SingleDateTimes", [])), - intended_use=categorize_processing_level( - dataset.get("umm", {}).get("ProcessingLevel", {}).get("Id", "advanced analysis") - ), - source_link=generate_source_link(dataset.get("umm", {}).get("DOI", {})), - indicators=dataset["indicators"], - geographic_coverage="", # Not provided in the data - data_visualization="", # dataset.get("umm", {}).get("RelatedUrls", [{}])[0].get("URL", ""), - latency="", # Not provided in the data - spatial_resolution="", # Not provided in the data - temporal_resolution="", # Not provided in the data - description_simplified="", # Not provided in the data - project="", # Not provided in the data - strengths="", # Not provided in the data - ) - ej_row.save() +def process_ej_dump(file_path: str) -> None: + """Process EJ dump file and create database entries.""" + + data_source = EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION + + # Clear existing data + EnvironmentalJusticeRow.objects.filter(data_source=data_source).delete() + + # Load the preprocessed data + with open(file_path) as f: + clean_data = json.load(f) + + # Create database entries + for entry in clean_data: + ej_row = EnvironmentalJusticeRow( + data_source=data_source, + sde_link=entry["sde_link"], + dataset=entry["dataset"], + description=entry["description"], + description_simplified="", # This field exists in 
model but not in data + # I think the "limitations" in SDE is equivalent to "weaknesses" from emily's data + limitations=entry["weaknesses"], + format=entry["format"], + temporal_extent=entry["temporal_extent"], + intended_use=entry["intended_use"], + source_link=entry["source_link"], + indicators=entry["indicators"], + strengths=entry["strengths"], + latency=entry["latency"], + geographic_coverage=entry["geographic_coverage"], + data_visualization=entry["data_visualization"], + temporal_resolution=entry["temporal_resolution"], + spatial_resolution=entry["spatial_resolution"], + project=entry["projects"], # Changed from 'projects' to 'project' to match model + ) + ej_row.save() + + +process_ej_dump("backups/ej_dump_20241203_170124.json") diff --git a/scripts/ej/config.py b/scripts/ej/config.py new file mode 100644 index 00000000..90b4d10f --- /dev/null +++ b/scripts/ej/config.py @@ -0,0 +1,34 @@ +"""Configuration settings for EJ data processing.""" + +# Threshold values for different indicators +INDICATOR_THRESHOLDS = { + "Not EJ": 0.80, + "Climate Change": 1.0, + "Disasters": 0.80, + "Extreme Heat": 0.50, + "Food Availability": 0.80, + "Health & Air Quality": 0.90, + "Human Dimensions": 0.80, + "Urban Flooding": 0.50, + "Water Availability": 0.80, +} + +# List of authorized classifications +AUTHORIZED_CLASSIFICATIONS = [ + # "Climate Change", + "Disasters", + "Extreme Heat", + "Food Availability", + "Health & Air Quality", + "Human Dimensions", + "Urban Flooding", + "Water Availability", +] + +# File paths and names +CMR_FILENAME = "cmr_collections_umm_20240807_142146.json" +INFERENCE_FILENAME = "alpha-1.3-wise-vortex-42-predictions.json" + +# Output format +TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S" +OUTPUT_FILENAME_TEMPLATE = "ej_dump_{}.json" diff --git a/scripts/ej/create_ej_dump.py b/scripts/ej/create_ej_dump.py index bab5baac..0ee73270 100644 --- a/scripts/ej/create_ej_dump.py +++ b/scripts/ej/create_ej_dump.py @@ -1,100 +1,132 @@ """ -inferences are supplied by the classification model. the contact point is Bishwas -cmr is supplied by running https://github.com/NASA-IMPACT/llm-app-EJ-classifier/blob/develop/scripts/data_processing/download_cmr.py -move to the serve like this: scp ej_dump_20240814_143036.json sde:/home/ec2-user/sde_indexing_helper/backups/ +Creates EJ dump files by processing CMR data and classifications. """ import json from datetime import datetime +from cmr_processing import CmrDataset +from threshold_processing import ThresholdProcessor + +try: + from config import ( + CMR_FILENAME, + INFERENCE_FILENAME, + OUTPUT_FILENAME_TEMPLATE, + TIMESTAMP_FORMAT, + ) +except ImportError: + from scripts.ej.config import ( + CMR_FILENAME, + INFERENCE_FILENAME, + OUTPUT_FILENAME_TEMPLATE, + TIMESTAMP_FORMAT, + ) + def load_json_file(file_path: str) -> dict: + """Load and parse a JSON file.""" with open(file_path) as file: return json.load(file) def save_to_json(data: dict | list, file_path: str) -> None: + """Save data to a JSON file with proper formatting.""" with open(file_path, "w") as file: json.dump(data, file, indent=2) -def process_classifications(predictions: list[dict[str, float]], threshold: float = 0.5) -> list[str]: - """ - Process the predictions and classify as follows: - 1. If 'Not EJ' is the highest scoring prediction, return 'Not EJ' as the only classification - 2. Filter classifications based on the threshold, excluding 'Not EJ' - 3. 
Default to 'Not EJ' if no classifications meet the threshold +def create_cmr_dict(cmr_data: list[dict]) -> dict[str, dict]: """ - highest_prediction = max(predictions, key=lambda x: x["score"]) - - if highest_prediction["label"] == "Not EJ": - return ["Not EJ"] - - classifications = [ - pred["label"] for pred in predictions if pred["score"] >= threshold and pred["label"] != "Not EJ" - ] + Restructure CMR data into a dictionary with concept-id as the key. - return classifications if classifications else ["Not EJ"] + Args: + cmr_data: List of CMR dataset dictionaries. - -def create_cmr_dict(cmr_data: list[dict[str, dict[str, str]]]) -> dict[str, dict[str, dict[str, str]]]: - """Restructure CMR data into a dictionary with 'concept-id' as the key.""" + Returns: + Dictionary mapping concept-ids to their respective CMR data. + """ return {dataset["meta"]["concept-id"]: dataset for dataset in cmr_data} -def remove_unauthorized_classifications(classifications: list[str]) -> list[str]: - """Filter classifications to keep only those in the authorized list.""" - - authorized_classifications = [ - "Climate Change", - "Disasters", - "Extreme Heat", - "Food Availability", - "Health & Air Quality", - "Human Dimensions", - "Urban Flooding", - "Water Availability", - ] +def create_clean_dataset( + inferences: list[dict], + cmr_dict: dict[str, dict], + processor: ThresholdProcessor, +) -> list[dict]: + """ + Create clean dataset with processed CMR data and classifications. + Excludes datasets classified as 'Not EJ'. - return [cls for cls in classifications if cls in authorized_classifications] + Args: + inferences: List of inference dictionaries containing predictions. + cmr_dict: Dictionary mapping concept-ids to CMR data. + processor: ThresholdProcessor instance for processing classifications. + Returns: + List of processed dataset dictionaries, excluding 'Not EJ' classifications. + """ + clean_data = [] -def update_cmr_with_classifications( - inferences: list[dict[str, dict]], - cmr_dict: dict[str, dict[str, dict]], - threshold: float = 0.5, -) -> list[dict[str, dict]]: - """Update CMR data with valid classifications based on inferences.""" + for inference in inferences: + concept_id = inference["concept-id"] + cmr_dataset = cmr_dict.get(concept_id) - predicted_cmr = [] + if cmr_dataset: + # Process classifications + classifications = processor.process_and_filter(inference["predictions"]) - for inference in inferences: - classifications = process_classifications(predictions=inference["predictions"], threshold=threshold) - classifications = remove_unauthorized_classifications(classifications) + # Only include datasets that have valid classifications and are not marked as 'Not EJ' + if classifications and "Not EJ" not in classifications: + # Process CMR data + processed_cmr = CmrDataset(cmr_dataset).to_dict() + processed_cmr["indicators"] = ";".join(classifications) + clean_data.append(processed_cmr) - if classifications: - cmr_dataset = cmr_dict.get(inference["concept-id"]) + return clean_data - if cmr_dataset: - cmr_dataset["indicators"] = ";".join(classifications) - predicted_cmr.append(cmr_dataset) - return predicted_cmr +def main( + cmr_file: str = CMR_FILENAME, + inference_file: str = INFERENCE_FILENAME, +) -> None: + """ + Main function to create EJ dump file. + Args: + cmr_file: Path to the CMR data JSON file. + inference_file: Path to the inference predictions JSON file. 
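+
+    Writes the result to ej_dump_<timestamp>.json in the current working
+    directory (see OUTPUT_FILENAME_TEMPLATE and TIMESTAMP_FORMAT in config.py).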
+ """ + # Initialize processor + processor = ThresholdProcessor() -def main(): - inferences = load_json_file("cmr-inference.json") - cmr = load_json_file("cmr_collections_umm_20240807_142146.json") + # Load input files + inferences = load_json_file(inference_file) + cmr = load_json_file(cmr_file) + # Create CMR dictionary cmr_dict = create_cmr_dict(cmr) - predicted_cmr = update_cmr_with_classifications(inferences=inferences, cmr_dict=cmr_dict, threshold=0.8) + # Create clean dataset with all required fields, excluding 'Not EJ' classifications + clean_data = create_clean_dataset( + inferences=inferences, + cmr_dict=cmr_dict, + processor=processor, + ) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - file_name = f"ej_dump_{timestamp}.json" + # Generate output filename with timestamp + timestamp = datetime.now().strftime(TIMESTAMP_FORMAT) + output_filename = OUTPUT_FILENAME_TEMPLATE.format(timestamp) - save_to_json(predicted_cmr, file_name) + # Save output + save_to_json(clean_data, output_filename) + print(f"Processed {len(clean_data)} EJ datasets from {cmr_file} and {inference_file}") + print() + print(f"Saved to {output_filename}") if __name__ == "__main__": - main() + main( + cmr_file=CMR_FILENAME, + inference_file=INFERENCE_FILENAME, + ) diff --git a/scripts/ej/test_cmr_processing.py b/scripts/ej/test_cmr_processing.py new file mode 100644 index 00000000..56b00b56 --- /dev/null +++ b/scripts/ej/test_cmr_processing.py @@ -0,0 +1,610 @@ +# docker-compose -f local.yml run --rm django pytest scripts/ej/test_cmr_processing.py +import json +from urllib.parse import urlparse + +import pytest +from cmr_processing import CmrDataset + + +# Helper function to load test data +def load_test_data(file_path="scripts/ej/cmr_example.json"): + with open(file_path) as f: + return json.load(f)[0] # First dataset from the example + + +class TestCmrDatasetIntegration: + """Integration tests using real CMR data example""" + + @pytest.fixture + def cmr_dataset(self): + return CmrDataset(load_test_data()) + + def test_full_dataset_processing(self, cmr_dataset): + """Test that all properties can be extracted from real data without errors""" + assert cmr_dataset.dataset_name == "2000 Pilot Environmental Sustainability Index (ESI)" + assert cmr_dataset.description.startswith("The 2000 Pilot Environmental Sustainability Index") + assert cmr_dataset.limitations == "None" + assert cmr_dataset.format == "PDF" + assert cmr_dataset.temporal_extent == "1978-01-01T00:00:00.000Z - 1999-12-31T00:00:00.000Z" + assert cmr_dataset.intended_use == "Path A" + assert cmr_dataset.source_link == "https://doi.org/10.7927/H4NK3BZJ" + assert "Long temporal extent" in cmr_dataset.strengths + assert "No recent data available" in cmr_dataset.weaknesses + assert cmr_dataset.latency == "Not Provided" + assert cmr_dataset.geographic_coverage == "" + assert ( + "https://sedac.ciesin.columbia.edu/downloads/maps/esi/esi-pilot-environmental-sustainability-index-2000/sedac-logo.jpg" # noqa + in cmr_dataset.data_visualization + ) + assert cmr_dataset.temporal_resolution == "" + assert cmr_dataset.spatial_resolution == "" + assert "ESI" in cmr_dataset.projects + + +class TestTemporalProcessing: + """Unit tests for temporal information processing""" + + def basic_temporal_data(self): + return { + "meta": {}, + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + { + "BeginningDateTime": "2020-01-01T00:00:00.000Z", + "EndingDateTime": "2020-12-31T23:59:59.999Z", + } + ], + "TemporalResolution": {"Unit": "Hour", "Value": 24}, + } + ] + 
}, + } + + def test_parse_datetime_with_milliseconds(self): + dataset = CmrDataset({}) + dt = dataset._parse_datetime("2020-01-01T00:00:00.123Z") + assert dt.year == 2020 + assert dt.microsecond == 123000 + + def test_parse_datetime_without_milliseconds(self): + dataset = CmrDataset({}) + dt = dataset._parse_datetime("2020-01-01T00:00:00Z") + assert dt.year == 2020 + assert dt.microsecond == 0 + + def test_temporal_info_with_invalid_dates(self): + data = { + "umm": { + "TemporalExtents": [ + {"RangeDateTimes": [{"BeginningDateTime": "invalid", "EndingDateTime": "2020-12-31T23:59:59.999Z"}]} + ] + } + } + dataset = CmrDataset(data) + assert dataset.temporal_info.total_duration == 0 + assert dataset.temporal_info.latest_end_date is None + + def test_temporal_resolution_parsing(self): + dataset = CmrDataset(self.basic_temporal_data()) + assert dataset.temporal_resolution == "24 Hour" + + def test_temporal_duration_calculation(self): + dataset = CmrDataset(self.basic_temporal_data()) + assert dataset.temporal_info.total_duration == 365 # Full year + + def test_multiple_time_ranges(self): + data = { + "meta": {}, + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + { + "BeginningDateTime": "2020-01-01T00:00:00.000Z", + "EndingDateTime": "2020-06-30T23:59:59.999Z", + }, + { + "BeginningDateTime": "2020-07-01T00:00:00.000Z", + "EndingDateTime": "2021-01-01T00:00:00.000Z", + }, + ] + } + ] + }, + } + dataset = CmrDataset(data) + assert dataset.temporal_info.total_duration == 365 + + def test_single_date_times(self): + data = { + "meta": {}, + "umm": {"TemporalExtents": [{"SingleDateTimes": ["2020-01-01T00:00:00.000Z", "2020-06-01T00:00:00.000Z"]}]}, + } + dataset = CmrDataset(data) + assert len(dataset.temporal_info.single_date_times) == 2 + assert dataset.temporal_extent == "2020-01-01T00:00:00.000Z, 2020-06-01T00:00:00.000Z" + + def test_missing_temporal_data(self): + dataset = CmrDataset({"meta": {}, "umm": {}}) + assert dataset.temporal_info.total_duration == 0 + assert dataset.temporal_info.latest_end_date is None + assert dataset.temporal_resolution == "" + + def test_single_date_only(self): + data = { + "meta": {}, + "umm": {"TemporalExtents": [{"SingleDateTimes": ["2020-01-01T00:00:00.000Z", "2020-06-01T00:00:00.000Z"]}]}, + } + dataset = CmrDataset(data) + assert dataset.temporal_extent == "2020-01-01T00:00:00.000Z, 2020-06-01T00:00:00.000Z" + + def test_range_date_only(self): + data = { + "meta": {}, + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + { + "BeginningDateTime": "2020-01-01T00:00:00.000Z", + "EndingDateTime": "2020-12-31T23:59:59.999Z", + }, + { + "BeginningDateTime": "2021-01-01T00:00:00.000Z", + "EndingDateTime": "2021-12-31T23:59:59.999Z", + }, + ] + } + ] + }, + } + dataset = CmrDataset(data) + assert ( + dataset.temporal_extent + == "2020-01-01T00:00:00.000Z - 2020-12-31T23:59:59.999Z, 2021-01-01T00:00:00.000Z - 2021-12-31T23:59:59.999Z" # noqa + ) + + def test_combined_single_and_range_dates(self): + data = { + "meta": {}, + "umm": { + "TemporalExtents": [ + { + "SingleDateTimes": ["2020-01-01T00:00:00.000Z"], + "RangeDateTimes": [ + { + "BeginningDateTime": "2021-01-01T00:00:00.000Z", + "EndingDateTime": "2021-12-31T23:59:59.999Z", + } + ], + } + ] + }, + } + dataset = CmrDataset(data) + assert ( + dataset.temporal_extent == "2020-01-01T00:00:00.000Z, 2021-01-01T00:00:00.000Z - 2021-12-31T23:59:59.999Z" + ) + + +class TestSpatialProcessing: + """Unit tests for spatial information processing""" + + def test_global_coverage_detection(self): + data = 
{ + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "Geometry": { + "BoundingRectangles": [ + { + "NorthBoundingCoordinate": 90, + "SouthBoundingCoordinate": -90, + "WestBoundingCoordinate": -180, + "EastBoundingCoordinate": 180, + } + ] + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.geographic_coverage == "Global" + + def test_non_global_coverage(self): + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "Geometry": { + "BoundingRectangles": [ + { + "NorthBoundingCoordinate": 45, + "SouthBoundingCoordinate": -45, + "WestBoundingCoordinate": -90, + "EastBoundingCoordinate": 90, + } + ] + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.geographic_coverage == "" + + def test_spatial_resolution_varies(self): + """Test spatial resolution when it varies.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": {"HorizontalDataResolution": {"VariesResolution": "Varies"}} + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "Varies" + + def test_spatial_resolution_gridded_range(self): + """Test spatial resolution with gridded range resolutions.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": { + "HorizontalDataResolution": { + "GriddedRangeResolutions": [ + { + "MinimumXDimension": 5.0, + "MinimumYDimension": 5.0, + "MaximumXDimension": 50.0, + "MaximumYDimension": 40.0, + "Unit": "Kilometers", + } + ] + } + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "50.0 kilometers" + + def test_spatial_resolution_gridded(self): + """Test spatial resolution with gridded resolutions.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": { + "HorizontalDataResolution": { + "GriddedResolutions": [{"XDimension": 30.0, "YDimension": 30.0, "Unit": "Meters"}] + } + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "30.0 meters" + + def test_spatial_resolution_generic(self): + """Test spatial resolution with generic resolutions.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": { + "HorizontalDataResolution": { + "GenericResolutions": [{"XDimension": 10.0, "YDimension": 10.0, "Unit": "Kilometers"}] + } + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "10.0 kilometers" + + def test_spatial_resolution_missing(self): + """Test spatial resolution when resolution data is missing.""" + data = {"umm": {"SpatialExtent": {"HorizontalSpatialDomain": {"ResolutionAndCoordinateSystem": {}}}}} + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "" + + def test_spatial_resolution_different_dimensions(self): + """Test spatial resolution when X and Y dimensions differ.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": { + "HorizontalDataResolution": { + "GriddedResolutions": [{"XDimension": 30.0, "YDimension": 40.0, "Unit": "Meters"}] + } + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "40.0 meters" + + def test_spatial_resolution_incomplete_data(self): + """Test spatial resolution with incomplete resolution data.""" + data = { + "umm": { + "SpatialExtent": { + "HorizontalSpatialDomain": { + "ResolutionAndCoordinateSystem": { + "HorizontalDataResolution": { + 
"GriddedResolutions": [ + { + "XDimension": 30.0, + # Missing YDimension + "Unit": "Meters", + } + ] + } + } + } + } + } + } + dataset = CmrDataset(data) + assert dataset.spatial_resolution == "" + + +class TestDownloadProcessing: + """Unit tests for download information processing""" + + def test_direct_download_detection(self): + data = { + "umm": { + "RelatedUrls": [ + { + "URLContentType": "DistributionURL", + "Type": "GET DATA", + "Subtype": "DIRECT DOWNLOAD", + "URL": "http://example.com/data", + } + ] + } + } + dataset = CmrDataset(data) + assert "Direct data download available" in dataset.strengths + + def test_visualization_urls(self): + data = { + "umm": { + "RelatedUrls": [ + {"URLContentType": "VisualizationURL", "URL": "http://example.com/viz1"}, + {"URLContentType": "VisualizationURL", "URL": "http://example.com/viz2"}, + ] + } + } + dataset = CmrDataset(data) + assert "http://example.com/viz1" in dataset.data_visualization + assert "http://example.com/viz2" in dataset.data_visualization + + def test_format_extraction_single(self): + data = { + "umm": { + "ArchiveAndDistributionInformation": { + "FileDistributionInformation": [{"Format": "GeoTIFF", "Fees": "0"}] + } + } + } + dataset = CmrDataset(data) + assert dataset.format == "GeoTIFF" + + def test_format_extraction_multiple(self): + data = { + "umm": { + "ArchiveAndDistributionInformation": { + "FileDistributionInformation": [ + {"Format": "Excel", "Fees": "0"}, + {"Format": "PDF", "Fees": "0"}, + {"Format": "PNG", "Fees": "0"}, + ] + } + } + } + dataset = CmrDataset(data) + assert dataset.format == "Excel; PDF; PNG" + + def test_format_extraction_empty(self): + data = {"umm": {"ArchiveAndDistributionInformation": {"FileDistributionInformation": []}}} + dataset = CmrDataset(data) + assert dataset.format == "" + + def test_format_extraction_missing_info(self): + data = {"umm": {"ArchiveAndDistributionInformation": {}}} + dataset = CmrDataset(data) + assert dataset.format == "" + + def test_format_extraction_no_archive_info(self): + data = {"umm": {}} + dataset = CmrDataset(data) + assert dataset.format == "" + + +class TestProcessingLevelInfo: + """Unit tests for processing level information""" + + def test_intended_use_exploration(self): + data = {"umm": {"ProcessingLevel": {"Id": "4"}, "CollectionDataType": "SCIENCE_QUALITY"}} + dataset = CmrDataset(data) + assert dataset.intended_use == "Path A" + + def test_intended_use_basic_analysis(self): + data = { + "umm": { + "ProcessingLevel": {"Id": "2"}, + "CollectionDataType": "SCIENCE_QUALITY", + "DataCenters": [{"ShortName": "SEDAC"}], + } + } + dataset = CmrDataset(data) + assert dataset.intended_use == "Path B" + + def test_intended_use_advanced_analysis(self): + # Added this test to cover Path C case + data = { + "umm": { + "ProcessingLevel": {"Id": "2"}, + "CollectionDataType": "SCIENCE_QUALITY", + "DataCenters": [{"ShortName": "OTHER"}], + } + } + dataset = CmrDataset(data) + assert dataset.intended_use == "Path C" + + def test_latency_mapping(self): + data = {"umm": {"CollectionDataType": "NEAR_REAL_TIME"}} + dataset = CmrDataset(data) + assert dataset.latency == "1-3 Hours" + + +class TestPropertiesGeneration: + """Unit tests for strengths and weaknesses generation""" + + def test_empty_properties(self): + dataset = CmrDataset({"meta": {}, "umm": {}}) + assert dataset.strengths == "" + assert dataset.weaknesses == "" + + def test_multiple_strengths(self): + data = { + "umm": { + "CollectionProgress": "ACTIVE", + "CollectionDataType": "NEAR_REAL_TIME", + 
"RelatedUrls": [ + {"URLContentType": "DistributionURL", "Type": "GET DATA", "Subtype": "DIRECT DOWNLOAD"} + ], + } + } + dataset = CmrDataset(data) + strengths = dataset.strengths.split("; ") + assert len(strengths) == 3 + assert "Data collection is ongoing" in strengths + assert "Near real-time data is available" in strengths + assert "Direct data download available" in strengths + + +class TestUrlProcessing: + """Unit tests for URL-related functionality""" + + def test_sde_link_generation(self): + data = {"meta": {"concept-id": "C179001887-SEDAC"}} + dataset = CmrDataset(data) + parsed_url = urlparse(dataset.sde_link) + assert parsed_url.hostname == "sciencediscoveryengine.nasa.gov" + assert "C179001887-SEDAC" in dataset.sde_link + + def test_source_link_generation(self): + data = {"umm": {"DOI": {"Authority": "https://doi.org/", "DOI": "10.1234/test"}}} + dataset = CmrDataset(data) + assert dataset.source_link == "https://doi.org/10.1234/test" + + def test_missing_doi_info(self): + dataset = CmrDataset({"umm": {"DOI": {}}}) + assert dataset.source_link == "" + + +class TestProjectProcessing: + """Unit tests for project information processing""" + + def test_multiple_projects(self): + data = {"umm": {"Projects": [{"ShortName": "short_1"}, {"ShortName": "short_2"}]}} + dataset = CmrDataset(data) + assert dataset.projects == "short_1; short_2" + + def test_missing_project_shortname(self): + data = {"umm": {"Projects": [{"LongName": "long_1"}, {"ShortName": "short_2"}]}} + dataset = CmrDataset(data) + assert dataset.projects == "long_1; short_2" + + def test_no_projects(self): + dataset = CmrDataset({"umm": {}}) + assert dataset.projects == "" + + +class TestStrengthsWeaknesses: + """Unit tests for strengths and weaknesses generation""" + + def test_recent_data_strength(self): + data = { + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + { + "BeginningDateTime": "2023-01-01T00:00:00.000Z", + "EndingDateTime": "2024-01-01T00:00:00.000Z", + } + ] + } + ] + } + } + dataset = CmrDataset(data) + assert "Recent data is available" in dataset.strengths + + def test_weaknesses_combination(self): + data = { + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + { + "BeginningDateTime": "2020-01-01T00:00:00.000Z", + "EndingDateTime": "2020-02-01T00:00:00.000Z", + } + ] + } + ], + "RelatedUrls": [{"URLContentType": "DistributionURL", "Type": "GET DATA"}], + } + } + dataset = CmrDataset(data) + weaknesses = dataset.weaknesses.split("; ") + assert "Limited temporal extent" in weaknesses + assert "Direct data download not available" in weaknesses + + +class TestEdgeCases: + """Tests for edge cases and error handling""" + + def test_empty_dataset(self): + dataset = CmrDataset({}) + assert dataset.dataset_name == "" + assert dataset.description == "" + assert dataset.limitations == "" + assert dataset.strengths == "" + assert dataset.weaknesses == "" + + def test_malformed_dates(self): + data = { + "umm": { + "TemporalExtents": [ + { + "RangeDateTimes": [ + {"BeginningDateTime": "not-a-date", "EndingDateTime": "2020-01-01T00:00:00.000Z"} + ] + } + ] + } + } + dataset = CmrDataset(data) + assert dataset.temporal_info.total_duration == 0 + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/scripts/ej/test_threshold_processing.py b/scripts/ej/test_threshold_processing.py new file mode 100644 index 00000000..424a81fa --- /dev/null +++ b/scripts/ej/test_threshold_processing.py @@ -0,0 +1,209 @@ +"""Unit tests for threshold processing functionality.""" + +# docker-compose -f 
local.yml run --rm django pytest scripts/ej/test_threshold_processing.py + +import pytest +from threshold_processing import ThresholdProcessor + + +class TestThresholdProcessor: + """Test suite for ThresholdProcessor class.""" + + @pytest.fixture + def default_thresholds(self): + """Default thresholds for testing.""" + return { + "Not EJ": 0.80, + "Urban Flooding": 0.50, + "Extreme Heat": 0.50, + "Water Availability": 0.80, + "Health & Air Quality": 0.90, + "Disasters": 0.80, + "Food Availability": 0.80, + "Human Dimensions": 0.80, + } + + @pytest.fixture + def authorized_classifications(self): + """Authorized classifications for testing.""" + return [ + "Urban Flooding", + "Extreme Heat", + "Water Availability", + "Health & Air Quality", + "Disasters", + "Food Availability", + "Human Dimensions", + ] + + @pytest.fixture + def processor(self, default_thresholds): + """Create a ThresholdProcessor instance with test thresholds.""" + return ThresholdProcessor(thresholds=default_thresholds) + + @pytest.fixture + def custom_processor(self): + """Create a ThresholdProcessor instance with simplified test thresholds.""" + custom_thresholds = { + "Not EJ": 0.75, + "Test Category 1": 0.60, + "Test Category 2": 0.80, + } + return ThresholdProcessor(thresholds=custom_thresholds) + + def test_initialization_with_thresholds(self, processor, default_thresholds): + """Test initialization with provided thresholds.""" + assert processor.thresholds == default_thresholds + assert "Not EJ" in processor.thresholds + assert processor.thresholds["Not EJ"] == 0.80 + + def test_initialization_custom_thresholds(self, custom_processor): + """Test initialization with custom thresholds.""" + assert custom_processor.thresholds["Not EJ"] == 0.75 + assert custom_processor.thresholds["Test Category 1"] == 0.60 + assert custom_processor.thresholds["Test Category 2"] == 0.80 + + def test_single_high_scoring_not_ej(self, processor): + """Test when 'Not EJ' has the highest score.""" + predictions = [ + {"label": "Not EJ", "score": 0.90}, + {"label": "Urban Flooding", "score": 0.85}, + {"label": "Water Availability", "score": 0.82}, + ] + result = processor.process_predictions(predictions) + assert result == ["Not EJ"] + assert len(result) == 1 + + def test_multiple_indicators_above_threshold(self, processor): + """Test when multiple indicators exceed their thresholds.""" + predictions = [ + {"label": "Not EJ", "score": 0.30}, + {"label": "Urban Flooding", "score": 0.75}, # Above 0.50 threshold + {"label": "Extreme Heat", "score": 0.60}, # Above 0.50 threshold + {"label": "Water Availability", "score": 0.85}, # Above 0.80 threshold + ] + result = processor.process_predictions(predictions) + assert len(result) == 3 + assert "Urban Flooding" in result + assert "Extreme Heat" in result + assert "Water Availability" in result + + def test_no_indicators_above_threshold(self, processor): + """Test when no indicators meet their thresholds.""" + predictions = [ + {"label": "Not EJ", "score": 0.70}, + {"label": "Urban Flooding", "score": 0.45}, # Below 0.50 threshold + {"label": "Water Availability", "score": 0.75}, # Below 0.80 threshold + ] + result = processor.process_predictions(predictions) + assert result == ["Not EJ"] + + def test_mixed_threshold_scenarios(self, processor): + """Test various mixed scenarios of threshold checking.""" + predictions = [ + {"label": "Not EJ", "score": 0.60}, + {"label": "Urban Flooding", "score": 0.55}, # Above 0.50 threshold + {"label": "Extreme Heat", "score": 0.45}, # Below 0.50 threshold + 
{"label": "Water Availability", "score": 0.85}, # Above 0.80 threshold + ] + result = processor.process_predictions(predictions) + assert len(result) == 2 + assert "Urban Flooding" in result + assert "Water Availability" in result + assert "Extreme Heat" not in result + + def test_authorized_classifications_filtering(self, processor, authorized_classifications): + """Test filtering of authorized classifications.""" + # Monkey patch the authorized classifications for this test + import threshold_processing + + original_authorized = threshold_processing.AUTHORIZED_CLASSIFICATIONS + threshold_processing.AUTHORIZED_CLASSIFICATIONS = authorized_classifications + + test_classifications = ["Urban Flooding", "Invalid Category", "Water Availability", "Another Invalid"] + result = processor.filter_authorized_classifications(test_classifications) + assert len(result) == 2 + assert all(r in authorized_classifications for r in result) + assert "Invalid Category" not in result + assert "Another Invalid" not in result + + # Restore original authorized classifications + threshold_processing.AUTHORIZED_CLASSIFICATIONS = original_authorized + + def test_process_and_filter_complete_pipeline(self, processor, authorized_classifications): + """Test the complete processing pipeline with unauthorized categories.""" + # Monkey patch the authorized classifications for this test + import threshold_processing + + original_authorized = threshold_processing.AUTHORIZED_CLASSIFICATIONS + threshold_processing.AUTHORIZED_CLASSIFICATIONS = authorized_classifications + + predictions = [ + {"label": "Not EJ", "score": 0.30}, + {"label": "Urban Flooding", "score": 0.75}, + {"label": "Invalid Category", "score": 0.95}, + {"label": "Water Availability", "score": 0.85}, + ] + result = processor.process_and_filter(predictions) + assert len(result) == 2 + assert "Urban Flooding" in result + assert "Water Availability" in result + assert "Invalid Category" not in result + + # Restore original authorized classifications + threshold_processing.AUTHORIZED_CLASSIFICATIONS = original_authorized + + def test_edge_case_empty_predictions(self, processor): + """Test handling of empty predictions list.""" + result = processor.process_predictions([]) + assert result == ["Not EJ"] + + def test_edge_case_missing_scores(self, processor): + """Test handling of predictions with missing scores.""" + predictions = [{"label": "Urban Flooding"}, {"label": "Water Availability", "score": 0.85}] # Missing score + with pytest.raises(KeyError): + processor.process_predictions(predictions) + + def test_edge_case_invalid_score_values(self, processor): + """Test handling of invalid score values.""" + predictions = [{"label": "Not EJ", "score": "invalid"}, {"label": "Urban Flooding", "score": 0.75}] + with pytest.raises(TypeError): + processor.process_predictions(predictions) + + def test_threshold_boundary_conditions(self, processor): + """Test classification at exact threshold boundaries.""" + predictions = [ + {"label": "Not EJ", "score": 0.30}, + {"label": "Urban Flooding", "score": 0.50}, # Exactly at threshold + {"label": "Water Availability", "score": 0.80}, # Exactly at threshold + {"label": "Health & Air Quality", "score": 0.89}, # Just below threshold + ] + result = processor.process_predictions(predictions) + assert len(result) == 2 + assert "Urban Flooding" in result + assert "Water Availability" in result + assert "Health & Air Quality" not in result + + def test_all_indicators_same_score(self, processor): + """Test behavior when all indicators 
have the same score.""" + predictions = [ + {"label": "Not EJ", "score": 0.85}, + {"label": "Urban Flooding", "score": 0.85}, + {"label": "Water Availability", "score": 0.85}, + ] + result = processor.process_predictions(predictions) + assert result == ["Not EJ"] # Since Not EJ is highest scoring (tied) prediction + + def test_high_scores_below_threshold(self, processor): + """Test when scores are high but still below their respective thresholds.""" + predictions = [ + {"label": "Not EJ", "score": 0.70}, + {"label": "Health & Air Quality", "score": 0.89}, # High but below 0.90 threshold + {"label": "Water Availability", "score": 0.79}, # High but below 0.80 threshold + ] + result = processor.process_predictions(predictions) + assert result == ["Not EJ"] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/scripts/ej/threshold_processing.py b/scripts/ej/threshold_processing.py new file mode 100644 index 00000000..da0ce048 --- /dev/null +++ b/scripts/ej/threshold_processing.py @@ -0,0 +1,83 @@ +"""Module for processing classification predictions with thresholds.""" + +try: + from config import AUTHORIZED_CLASSIFICATIONS, INDICATOR_THRESHOLDS +except ImportError: + from scripts.ej.config import AUTHORIZED_CLASSIFICATIONS, INDICATOR_THRESHOLDS + + +class ThresholdProcessor: + """ + Processes classification predictions using configurable thresholds. + """ + + def __init__(self, thresholds: dict[str, float] = None): + """ + Initialize the processor with thresholds. + + Args: + thresholds: Dictionary of classification labels and their threshold values. + If None, uses default thresholds from config. + """ + self.thresholds = thresholds or INDICATOR_THRESHOLDS + + def process_predictions(self, predictions: list[dict[str, float]]) -> list[str]: + """ + Process predictions and classify based on individual thresholds. + + Args: + predictions: List of dictionaries containing prediction labels and scores. + Each dict should have 'label' and 'score' keys. + + Returns: + List of classification labels that meet their respective thresholds. + """ + # Handle empty predictions + if not predictions: + return ["Not EJ"] + + # Find highest scoring prediction + highest_prediction = max(predictions, key=lambda x: x["score"]) + + # If highest prediction is "Not EJ", return it as the only classification + if highest_prediction["label"] == "Not EJ": + return ["Not EJ"] + + # Filter classifications based on thresholds + classifications = [ + pred["label"] + for pred in predictions + if ( + pred["label"] in self.thresholds # Only check labels we have thresholds for + and pred["score"] >= self.thresholds[pred["label"]] + and pred["label"] != "Not EJ" + ) + ] + + # Default to "Not EJ" if no classifications meet thresholds + return classifications if classifications else ["Not EJ"] + + def filter_authorized_classifications(self, classifications: list[str]) -> list[str]: + """ + Filter classifications to keep only authorized ones. + + Args: + classifications: List of classification labels. + + Returns: + List of authorized classification labels. + """ + return [cls for cls in classifications if cls in AUTHORIZED_CLASSIFICATIONS] + + def process_and_filter(self, predictions: list[dict[str, float]]) -> list[str]: + """ + Process predictions and filter to authorized classifications. + + Args: + predictions: List of dictionaries containing prediction labels and scores. + + Returns: + List of authorized classification labels that meet their thresholds. 
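+
+        Example (illustrative only; the real thresholds and authorized labels come from
+        config.INDICATOR_THRESHOLDS and config.AUTHORIZED_CLASSIFICATIONS):
+            processor = ThresholdProcessor({"Not EJ": 0.80, "Urban Flooding": 0.50})
+            processor.process_and_filter([{"label": "Urban Flooding", "score": 0.90}])
+            # -> ["Urban Flooding"] if that label is authorized, otherwise []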
+ """ + classifications = self.process_predictions(predictions) + return self.filter_authorized_classifications(classifications) diff --git a/scripts/find_redirects_solar_urls.py b/scripts/find_redirects_solar_urls.py index 3bdbc131..db78081b 100644 --- a/scripts/find_redirects_solar_urls.py +++ b/scripts/find_redirects_solar_urls.py @@ -43,9 +43,9 @@ def csv_to_dict_list(file_path): scraped_title = soup.find("title").text.strip() if soup.find("title") else "" except (AssertionError, Exception) as parse_error: scraped_title = "" - print(f"Error parsing URL {url_info['url']}: {parse_error}") + print(f"Error parsing URL {url_info['url']}: {parse_error}") # noqa: F821 except requests.RequestException as e: - print(f"Error fetching URL {url_info['url']}: {e}") + print(f"Error fetching URL {url_info['url']}: {e}") # noqa: F821 response_url = "" scraped_title = "" diff --git a/scripts/quality_and_indexing/restore_deleted_files.py b/scripts/quality_and_indexing/restore_deleted_files.py index 6d6fcb84..70721cc9 100644 --- a/scripts/quality_and_indexing/restore_deleted_files.py +++ b/scripts/quality_and_indexing/restore_deleted_files.py @@ -1,5 +1,6 @@ """ -you need to run this script in the root of the repository that from which the file was deleted, in this case the root of the sinequa_configs repository. +you need to run this script in the root of the repository that from which the file was deleted, +in this case the root of the sinequa_configs repository. """ import subprocess diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 1b38db21..02ba0900 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -1,12 +1,37 @@ import csv +from django import forms from django.contrib import admin, messages from django.http import HttpResponse +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaResolvedTitle, + DeltaTitlePattern, +) + from .models.candidate_url import CandidateURL, ResolvedTitle -from .models.collection import Collection, WorkflowHistory +from .models.collection import Collection, ReindexingHistory, WorkflowHistory +from .models.collection_choice_fields import TDAMMTags +from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern -from .tasks import import_candidate_urls_from_api +from .tasks import fetch_and_replace_full_text, import_candidate_urls_from_api + + +def fetch_and_replace_text_for_server(modeladmin, request, queryset, server_name): + for collection in queryset: + fetch_and_replace_full_text.delay(collection.id, server_name) + modeladmin.message_user(request, f"Started importing URLs from {server_name.upper()} Server") + + +@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text") +def fetch_full_text_lrm_dev_action(modeladmin, request, queryset): + fetch_and_replace_text_for_server(modeladmin, request, queryset, "lrm_dev") + + +@admin.action(description="Import candidate URLs from XLI Server with Full Text") +def fetch_full_text_xli_action(modeladmin, request, queryset): + fetch_and_replace_text_for_server(modeladmin, request, queryset, "xli") @admin.action(description="Generate deployment message") @@ -109,7 +134,7 @@ def import_candidate_urls_from_api_caller(modeladmin, request, queryset, server_ messages.add_message( request, messages.INFO, - f"Started importing URLs from the API for: {collection_names} from {server_name.title()}", + f"Started importing URLs from the API for: {collection_names} from 
{server_name.upper()} Server", ) @@ -133,19 +158,19 @@ def import_candidate_urls_secret_production(modeladmin, request, queryset): import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production") -@admin.action(description="Import candidate URLs from Li's Server") -def import_candidate_urls_lis_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lis_server") +@admin.action(description="Import candidate URLs from XLI Server") +def import_candidate_urls_xli_server(modeladmin, request, queryset): + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "xli") @admin.action(description="Import candidate URLs from LRM Dev Server") def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev_server") + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev") @admin.action(description="Import candidate URLs from LRM QA Server") def import_candidate_urls_lrm_qa_server(modeladmin, request, queryset): - import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa_server") + import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa") class ExportCsvMixin: @@ -154,7 +179,7 @@ def export_as_csv(self, request, queryset): field_names = [field.name for field in meta.fields] response = HttpResponse(content_type="text/csv") - response["Content-Disposition"] = f"attachment; filename={meta}.csv" + response["Content-Disposition"] = f"attachment; filename={meta}.csv" # noqa: E702 writer = csv.writer(response) writer.writerow(field_names) @@ -192,6 +217,7 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): "source", "turned_on", "is_multi_division", + "reindexing_status", ), }, ), @@ -218,27 +244,55 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin): list_display = ( "name", "candidate_urls_count", + "included_candidate_urls_count", + "delta_urls_count", + "included_delta_urls_count", + "included_curated_urls_count", "config_folder", "url", "division", "new_collection", "is_multi_division", + "reindexing_status", ) + + def included_candidate_urls_count(self, obj) -> int: + return obj.candidate_urls.filter(excluded=False).count() + + included_candidate_urls_count.short_description = "Included Candidate URLs Count" + + def delta_urls_count(self, obj) -> int: + return obj.delta_urls.count() + + delta_urls_count.short_description = "Total Delta URLs Count" + + def included_delta_urls_count(self, obj) -> int: + return obj.delta_urls.filter(excluded=False).count() + + included_delta_urls_count.short_description = "Included Delta URLs Count" + + def included_curated_urls_count(self, obj) -> int: + return obj.curated_urls.filter(excluded=False).count() + + included_curated_urls_count.short_description = "Included Curated URLs Count" + readonly_fields = ("config_folder",) - list_filter = ("division", "curation_status", "workflow_status", "turned_on", "is_multi_division") + list_filter = ( + "division", + "curation_status", + "workflow_status", + "turned_on", + "is_multi_division", + "reindexing_status", + ) search_fields = ("name", "url", "config_folder") actions = [ generate_deployment_message, "export_as_csv", "update_config", download_candidate_urls_as_csv, - import_candidate_urls_test, - import_candidate_urls_production, - import_candidate_urls_secret_test, - import_candidate_urls_secret_production, - 
import_candidate_urls_lis_server, - import_candidate_urls_lrm_dev_server, - import_candidate_urls_lrm_qa_server, + fetch_full_text_lrm_dev_action, + fetch_full_text_xli_action, ] ordering = ("cleaning_order",) @@ -262,11 +316,90 @@ def exclude_and_delete_children(modeladmin, request, queryset): candidate_url.get_children().delete() -class CandidateURLAdmin(admin.ModelAdmin): - """Admin View for CandidateURL""" +class TDAMMFormMixin(forms.ModelForm): + """Mixin for forms that need TDAMM tag fields""" + + tdamm_tag_manual = forms.MultipleChoiceField( + choices=TDAMMTags.choices, + required=False, + label="TDAMM Manual Tags", + widget=forms.CheckboxSelectMultiple, + ) + + tdamm_tag_ml = forms.MultipleChoiceField( + choices=TDAMMTags.choices, + required=False, + label="TDAMM ML Tags", + widget=forms.CheckboxSelectMultiple, + ) + + +class TDAMMAdminMixin: + """Mixin for admin classes that handle TDAMM tags""" + + list_display = ("url", "scraped_title", "generated_title", "collection") + list_filter = ["collection"] + search_fields = ("url", "collection__name") + + def get_fieldsets(self, request, obj=None): + fieldsets = [ + ( + "Overall Information", + { + "fields": ( + "collection", + "url", + "scraped_title", + "scraped_text", + "generated_title", + "visited", + "document_type", + "division", + ) + }, + ), + ( + "TDAMM Tags", + { + "fields": ( + "tdamm_tag_ml", + "tdamm_tag_manual", + ), + "classes": ("collapse",), + }, + ), + ] + return fieldsets + + +class CandidateURLForm(TDAMMFormMixin): + class Meta: + model = CandidateURL + fields = "__all__" + + +class DumpURLForm(TDAMMFormMixin, forms.ModelForm): + class Meta: + model = DumpUrl + fields = "__all__" + + +class DeltaURLForm(TDAMMFormMixin, forms.ModelForm): + class Meta: + model = DeltaUrl + fields = "__all__" + + +class CuratedURLForm(TDAMMFormMixin, forms.ModelForm): + class Meta: + model = CuratedUrl + fields = "__all__" - list_display = ("url", "scraped_title", "collection") - list_filter = ("collection",) + +class CandidateURLAdmin(TDAMMAdminMixin, admin.ModelAdmin): + """Admin view for CandidateURL""" + + form = CandidateURLForm class TitlePatternAdmin(admin.ModelAdmin): @@ -290,6 +423,12 @@ class WorkflowHistoryAdmin(admin.ModelAdmin): list_filter = ["workflow_status", "old_status"] +class ReindexingHistoryAdmin(admin.ModelAdmin): + list_display = ("collection", "old_status", "reindexing_status", "created_at") + search_fields = ["collection__name"] + list_filter = ["reindexing_status", "old_status"] + + class ResolvedTitleAdmin(admin.ModelAdmin): list_display = ["title_pattern", "candidate_url", "resolved_title", "created_at"] @@ -299,9 +438,66 @@ class DivisionPatternAdmin(admin.ModelAdmin): search_fields = ("match_pattern", "division") +# deltas below +class DeltaTitlePatternAdmin(admin.ModelAdmin): + """Admin View for DeltaTitlePattern""" + + list_display = ( + "match_pattern", + "title_pattern", + "collection", + "match_pattern_type", + ) + list_filter = ( + "match_pattern_type", + "collection", + ) + + +class DeltaResolvedTitleAdmin(admin.ModelAdmin): + list_display = ["title_pattern", "delta_url", "resolved_title", "created_at"] + + +class DeltaDivisionPatternAdmin(admin.ModelAdmin): + list_display = ("collection", "match_pattern", "division") + search_fields = ("match_pattern", "division") + + +class DumpUrlAdmin(TDAMMAdminMixin, admin.ModelAdmin): + """Admin View for DumpUrl""" + + form = DumpURLForm + + +class DeltaUrlAdmin(TDAMMAdminMixin, admin.ModelAdmin): + """Admin View for DeltaUrl""" + + form = DeltaURLForm + + 
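+    # The override below extends the shared TDAMM fieldsets with the delta-specific
+    # "to_delete" flag so it can be reviewed and edited from the admin.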
def get_fieldsets(self, request, obj=None): + fieldsets = super().get_fieldsets(request, obj) + fieldsets[0][1]["fields"] += ("to_delete",) + return fieldsets + + +class CuratedUrlAdmin(TDAMMAdminMixin, admin.ModelAdmin): + """Admin View for CuratedUrl""" + + form = CuratedURLForm + + +admin.site.register(ReindexingHistory, ReindexingHistoryAdmin) admin.site.register(WorkflowHistory, WorkflowHistoryAdmin) admin.site.register(CandidateURL, CandidateURLAdmin) admin.site.register(TitlePattern, TitlePatternAdmin) admin.site.register(IncludePattern) admin.site.register(ResolvedTitle, ResolvedTitleAdmin) admin.site.register(DivisionPattern, DivisionPatternAdmin) + + +admin.site.register(DeltaTitlePattern, DeltaTitlePatternAdmin) +admin.site.register(DeltaResolvedTitle, DeltaResolvedTitleAdmin) +admin.site.register(DeltaDivisionPattern, DeltaDivisionPatternAdmin) +admin.site.register(DumpUrl, DumpUrlAdmin) +admin.site.register(DeltaUrl, DeltaUrlAdmin) +admin.site.register(CuratedUrl, CuratedUrlAdmin) diff --git a/sde_collections/management/commands/database_backup.py b/sde_collections/management/commands/database_backup.py new file mode 100644 index 00000000..090de63e --- /dev/null +++ b/sde_collections/management/commands/database_backup.py @@ -0,0 +1,143 @@ +""" +Management command to backup PostgreSQL database. + +Usage: + docker-compose -f local.yml run --rm django python manage.py database_backup + docker-compose -f local.yml run --rm django python manage.py database_backup --no-compress + docker-compose -f local.yml run --rm django python manage.py database_backup --output my_backup.sql + docker-compose -f production.yml run --rm django python manage.py database_backup + +All backups are stored in the /backups directory, which is mounted as a volume in both local +and production environments. If specifying a custom output path, it will be relative to this directory. +""" + +import gzip +import os +import shutil +import subprocess +from contextlib import contextmanager +from datetime import datetime + +from django.conf import settings +from django.core.management.base import BaseCommand + + +@contextmanager +def temp_file_handler(filename: str): + """Context manager to handle temporary files, ensuring cleanup.""" + try: + yield filename + finally: + if os.path.exists(filename): + os.remove(filename) + + +class Command(BaseCommand): + help = "Creates a PostgreSQL backup using pg_dump" + + def add_arguments(self, parser): + parser.add_argument( + "--no-compress", + action="store_true", + help="Disable backup file compression (enabled by default)", + ) + parser.add_argument( + "--output", + type=str, + help="Output file path (default: auto-generated in /app/backups directory)", + ) + + def get_backup_filename(self, compress: bool, custom_output: str = None) -> tuple[str, str]: + """Generate backup filename and actual dump path. 
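+
+        Example (illustrative; the date and environment name depend on when and where the
+        command runs): with compression enabled, no custom output, and BACKUP_ENVIRONMENT=staging
+        on 2024-01-15, this returns
+        ("/app/backups/staging_backup_20240115.sql.gz", "/app/backups/staging_backup_20240115.sql").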
+ + Args: + compress: Whether the output should be compressed + custom_output: Optional custom output path + + Returns: + tuple[str, str]: A tuple containing: + - final_filename: Full path for the final backup file (with .gz if compressed) + - temp_filename: Full path for the temporary dump file (without .gz) + """ + backup_dir = "/app/backups" + os.makedirs(backup_dir, exist_ok=True) + + if custom_output: + # If custom_output is relative, make it relative to backup_dir + if not custom_output.startswith("/"): + custom_output = os.path.join(backup_dir, custom_output) + + # Ensure the output directory exists + output_dir = os.path.dirname(custom_output) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + if compress: + return custom_output + ( + ".gz" if not custom_output.endswith(".gz") else "" + ), custom_output.removesuffix( # noqa + ".gz" + ) + return custom_output, custom_output + else: + date_str = datetime.now().strftime("%Y%m%d") + env_name = os.getenv("BACKUP_ENVIRONMENT", "unknown") + temp_filename = os.path.join(backup_dir, f"{env_name}_backup_{date_str}.sql") + final_filename = f"{temp_filename}.gz" if compress else temp_filename + return final_filename, temp_filename + + def run_pg_dump(self, output_file: str, env: dict) -> None: + """Execute pg_dump with given parameters.""" + db_settings = settings.DATABASES["default"] + cmd = [ + "pg_dump", + "-h", + db_settings["HOST"], + "-U", + db_settings["USER"], + "-d", + db_settings["NAME"], + "--no-owner", + "--no-privileges", + "-f", + output_file, + ] + subprocess.run(cmd, env=env, check=True) + + def compress_file(self, input_file: str, output_file: str) -> None: + """Compress input file to output file using gzip.""" + with open(input_file, "rb") as f_in: + with gzip.open(output_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + def handle(self, *args, **options): + if not os.getenv("BACKUP_ENVIRONMENT"): + self.stdout.write( + self.style.WARNING( + "Note: Set BACKUP_ENVIRONMENT in your env if you want automatic environment-based filenames" + ) + ) + + compress = not options["no_compress"] + backup_file, dump_file = self.get_backup_filename(compress, options.get("output")) + + env = os.environ.copy() + env["PGPASSWORD"] = settings.DATABASES["default"]["PASSWORD"] + + try: + if compress: + with temp_file_handler(dump_file): + self.run_pg_dump(dump_file, env) + self.compress_file(dump_file, backup_file) + else: + self.run_pg_dump(backup_file, env) + + self.stdout.write( + self.style.SUCCESS( + f"Successfully created {'compressed ' if compress else ''}backup at: backups/{os.path.basename(backup_file)}" # noqa + ) + ) + except subprocess.CalledProcessError as e: + self.stdout.write(self.style.ERROR(f"Backup failed: {str(e)}")) + except Exception as e: + self.stdout.write(self.style.ERROR(f"Error during backup process: {str(e)}")) diff --git a/sde_collections/management/commands/database_restore.py b/sde_collections/management/commands/database_restore.py new file mode 100644 index 00000000..7410484d --- /dev/null +++ b/sde_collections/management/commands/database_restore.py @@ -0,0 +1,145 @@ +""" +Management command to restore PostgreSQL database from backup. + +Usage: + docker-compose -f local.yml run --rm django python manage.py database_restore backups/backup.sql[.gz] + docker-compose -f production.yml run --rm django python manage.py database_restore backups/backup.sql[.gz] + +The backup file should be located in the /backups directory, which is mounted as a volume in both +local and production environments. 
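+
+Note: the restore first drops and recreates the target database (after terminating any open
+connections), so the backup's contents replace whatever data the database currently holds.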
+""" + +import enum +import gzip +import os +import shutil +import socket +import subprocess +from contextlib import contextmanager + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError +from django.db import connections + + +class Server(enum.Enum): + PRODUCTION = "PRODUCTION" + STAGING = "STAGING" + UNKNOWN = "UNKNOWN" + + +def detect_server() -> Server: + hostname = socket.gethostname().upper() + if "PRODUCTION" in hostname: + return Server.PRODUCTION + elif "STAGING" in hostname: + return Server.STAGING + return Server.UNKNOWN + + +@contextmanager +def temp_file_handler(filename: str): + """Context manager to handle temporary files, ensuring cleanup.""" + try: + yield filename + finally: + if os.path.exists(filename): + os.remove(filename) + + +class Command(BaseCommand): + help = "Restores PostgreSQL database from backup file (compressed or uncompressed)" + + def add_arguments(self, parser): + parser.add_argument("backup_path", type=str, help="Path to the backup file (.sql or .sql.gz)") + + def get_db_settings(self): + """Get database connection settings.""" + db = settings.DATABASES["default"] + return { + "host": db["HOST"], + "name": db["NAME"], + "user": db["USER"], + "password": db["PASSWORD"], + } + + def run_psql_command(self, command: str, db_name: str = "postgres", env: dict = None) -> None: + """Execute a psql command.""" + db = self.get_db_settings() + cmd = ["psql", "-h", db["host"], "-U", db["user"], "-d", db_name, "-c", command] + subprocess.run(cmd, env=env, check=True) + + def terminate_database_connections(self, env: dict) -> None: + """Terminate all connections to the database.""" + db = self.get_db_settings() + # Close Django's connection first + connections.close_all() + + # Terminate any remaining PostgreSQL connections + terminate_conn_sql = f""" + SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = '{db["name"]}' + AND pid <> pg_backend_pid(); + """ + try: + self.run_psql_command(terminate_conn_sql, env=env) + except subprocess.CalledProcessError: + # If this fails, it's usually because there are no connections to terminate + pass + + def reset_database(self, env: dict) -> None: + """Drop and recreate the database.""" + db = self.get_db_settings() + + self.stdout.write(f"Terminating connections to {db['name']}...") + self.terminate_database_connections(env) + + self.stdout.write(f"Dropping database {db['name']}...") + self.run_psql_command(f"DROP DATABASE IF EXISTS {db['name']}", env=env) + + self.stdout.write(f"Creating database {db['name']}...") + self.run_psql_command(f"CREATE DATABASE {db['name']}", env=env) + + def restore_backup(self, backup_file: str, env: dict) -> None: + """Restore database from backup file.""" + db = self.get_db_settings() + cmd = ["psql", "-h", db["host"], "-U", db["user"], "-d", db["name"], "-f", backup_file] + self.stdout.write("Restoring from backup...") + subprocess.run(cmd, env=env, check=True) + + def decompress_file(self, input_file: str, output_file: str) -> None: + """Decompress gzipped file to output file.""" + with gzip.open(input_file, "rb") as f_in: + with open(output_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + def handle(self, *args, **options): + server = detect_server() + backup_path = options["backup_path"] + is_compressed = backup_path.endswith(".gz") + + if not os.path.exists(backup_path): + raise CommandError(f"Backup file not found: {backup_path}") + + env = os.environ.copy() + env["PGPASSWORD"] = 
self.get_db_settings()["password"] + + try: + # Reset the database first + self.reset_database(env) + + # Handle backup restoration + if is_compressed: + with temp_file_handler(backup_path[:-3]) as temp_file: + self.decompress_file(backup_path, temp_file) + self.restore_backup(temp_file, env) + else: + self.restore_backup(backup_path, env) + + self.stdout.write(self.style.SUCCESS(f"Successfully restored {server.value} database from {backup_path}")) + + except subprocess.CalledProcessError as e: + self.stdout.write(self.style.ERROR(f"Restore failed on {server.value}: {str(e)}")) + except Exception as e: + self.stdout.write(self.style.ERROR(f"Error during restore process: {str(e)}")) diff --git a/sde_collections/management/commands/deduplicate_patterns.py b/sde_collections/management/commands/deduplicate_patterns.py new file mode 100644 index 00000000..f9de42e6 --- /dev/null +++ b/sde_collections/management/commands/deduplicate_patterns.py @@ -0,0 +1,47 @@ +# docker-compose -f local.yml run --rm django python manage.py deduplicate_patterns +# docker-compose -f production.yml run --rm django python manage.py deduplicate_patterns + +from collections import defaultdict + +from django.core.management.base import BaseCommand +from django.db.models import Count + +from sde_collections.models.pattern import ( + DivisionPattern, + DocumentTypePattern, + ExcludePattern, + IncludePattern, + TitlePattern, +) + + +class Command(BaseCommand): + help = "Remove duplicate patterns within collections for all pattern types" + + def handle(self, *args, **kwargs): + pattern_models = [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern] + + deletion_counts = defaultdict(int) + + for model in pattern_models: + # Get all collections that have duplicate patterns + collections_with_dupes = ( + model.objects.values("collection", "match_pattern") + .annotate(pattern_count=Count("id")) + .filter(pattern_count__gt=1) + ) + + for group in collections_with_dupes: + # Get all patterns for this collection/match_pattern combo + patterns = model.objects.filter(collection_id=group["collection"], match_pattern=group["match_pattern"]) + + # Keep one pattern, delete the rest + patterns_to_delete = patterns[1:] + for pattern in patterns_to_delete: + pattern.delete() + deletion_counts[model.__name__] += 1 + + # Print final summary + for model_name, count in deletion_counts.items(): + self.stdout.write(f"{model_name}: {count}") + self.stdout.write(f"Total: {sum(deletion_counts.values())}") diff --git a/sde_collections/management/commands/deduplicate_urls.py b/sde_collections/management/commands/deduplicate_urls.py new file mode 100644 index 00000000..251ae887 --- /dev/null +++ b/sde_collections/management/commands/deduplicate_urls.py @@ -0,0 +1,88 @@ +import time + +from django.core.management.base import BaseCommand +from django.db.models import Count, Min + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices + + +class Command(BaseCommand): + help = "Deduplicate CandidateURLs" + + def handle(self, *args, **kwargs): + deduplicate_candidate_urls() + + +def is_priority_collection(collection): + priority_statuses = { + WorkflowStatusChoices.CURATED, + WorkflowStatusChoices.QUALITY_FIXED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED, + WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, + 
WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, + WorkflowStatusChoices.QUALITY_CHECK_FAILED, + WorkflowStatusChoices.QUALITY_CHECK_MINOR, + WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + WorkflowStatusChoices.PROD_PERFECT, + WorkflowStatusChoices.PROD_MINOR, + WorkflowStatusChoices.PROD_MAJOR, + } + return collection.workflow_status in priority_statuses + + +def deduplicate_candidate_urls(): + start_time = time.time() + + collection_counts = { + c["id"]: c["url_count"] + for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count") + } + + collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()} + + # Phase 1: Intra-collection duplicates + intra_dupes = ( + CandidateURL.objects.values("collection_id", "url") + .annotate(count=Count("id"), min_id=Min("id")) + .filter(count__gt=1) + ) + + intra_ids_to_delete = [] + for dupe in intra_dupes: + dupe_ids = set( + CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"]) + .exclude(id=dupe["min_id"]) + .values_list("id", flat=True) + ) + intra_ids_to_delete.extend(dupe_ids) + + CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete() + + # Phase 2: Cross-collection duplicates + cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1) + + cross_ids_to_delete = [] + for dupe in cross_dupes: + instances = list(CandidateURL.objects.filter(url=dupe["url"]).values("id", "collection_id")) + + priority_instances = [i for i in instances if collection_status[i["collection_id"]]] + non_priority_instances = [i for i in instances if not collection_status[i["collection_id"]]] + + if priority_instances: + keep_instance = min(priority_instances, key=lambda x: collection_counts[x["collection_id"]]) + else: + keep_instance = min(non_priority_instances, key=lambda x: collection_counts[x["collection_id"]]) + + delete_ids = [i["id"] for i in instances if i["id"] != keep_instance["id"]] + cross_ids_to_delete.extend(delete_ids) + + CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete() + + elapsed_time = time.time() - start_time + action = "Deleted" + print( + f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) in {elapsed_time:.2f} seconds" # noqa + ) diff --git a/sde_collections/management/commands/migrate_urls_and_patterns.py b/sde_collections/management/commands/migrate_urls_and_patterns.py new file mode 100644 index 00000000..e48cde41 --- /dev/null +++ b/sde_collections/management/commands/migrate_urls_and_patterns.py @@ -0,0 +1,147 @@ +import time + +from django.apps import apps +from django.core.management.base import BaseCommand +from django.db.models import Count + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, + DeltaIncludePattern, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl +from sde_collections.models.pattern import ( + DivisionPattern, + DocumentTypePattern, + ExcludePattern, + IncludePattern, + TitlePattern, +) + +STATUSES_TO_MIGRATE = [ + WorkflowStatusChoices.CURATED, + WorkflowStatusChoices.QUALITY_FIXED, + 
WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED, + WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, + WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, + WorkflowStatusChoices.QUALITY_CHECK_FAILED, + WorkflowStatusChoices.QUALITY_CHECK_MINOR, + WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + WorkflowStatusChoices.PROD_PERFECT, + WorkflowStatusChoices.PROD_MINOR, + WorkflowStatusChoices.PROD_MAJOR, +] + + +class Command(BaseCommand): + help = """Migrate CandidateURLs to DeltaUrl, apply the matching patterns, + and then promote to CuratedUrl based on collection workflow status""" + + def handle(self, *args, **kwargs): + # Log the start time for the entire process + overall_start_time = time.time() + self.stdout.write("Starting the migration process...") + + # Step 1: Clear all Delta instances + start_time = time.time() + DumpUrl.objects.all().delete() + CuratedUrl.objects.all().delete() + DeltaUrl.objects.all().delete() + DeltaExcludePattern.objects.all().delete() + DeltaIncludePattern.objects.all().delete() + DeltaTitlePattern.objects.all().delete() + DeltaDocumentTypePattern.objects.all().delete() + DeltaDivisionPattern.objects.all().delete() + self.stdout.write(f"Cleared all Delta instances in {time.time() - start_time:.2f} seconds.") + + # Step 2: Get collections ordered by URL count + start_time = time.time() + total_collections = Collection.objects.count() + collections = Collection.objects.annotate(url_count=Count("candidate_urls")).order_by("url_count") + self.stdout.write(f"Retrieved and ordered collections in {time.time() - start_time:.2f} seconds.") + + # Set to track URLs globally across all collections + global_unique_urls = set() + + # Process each collection individually + for index, collection in enumerate(collections): + collection_start_time = time.time() + self.stdout.write( + f"\nProcessing collection: {collection} with {collection.url_count} URLs ({index + 1}/{total_collections})" # noqa + ) + + # Step 3: Migrate CandidateURLs to DeltaUrl for this collection + urls_start_time = time.time() + delta_urls = [] + + for candidate_url in CandidateURL.objects.filter(collection=collection): + if candidate_url.url not in global_unique_urls: + global_unique_urls.add(candidate_url.url) + delta_urls.append( + DeltaUrl( + collection=candidate_url.collection, + url=candidate_url.url, + scraped_title=candidate_url.scraped_title, + generated_title=candidate_url.generated_title, + visited=candidate_url.visited, + document_type=candidate_url.document_type, + division=candidate_url.division, + to_delete=False, + ) + ) + + # Bulk create the unique DeltaUrl instances for this collection + DeltaUrl.objects.bulk_create(delta_urls) + self.stdout.write( + f"Migrated {len(delta_urls)} URLs to DeltaUrl in {time.time() - urls_start_time:.2f} seconds" + ) + + # Step 4: Migrate Patterns for this collection + patterns_start_time = time.time() + + for pattern_model in [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]: + self.migrate_patterns_for_collection(pattern_model, collection) + + self.stdout.write(f"Pattern migration completed in {time.time() - patterns_start_time:.2f} seconds") + + # Step 5: Promote to CuratedUrl if applicable + if collection.workflow_status in STATUSES_TO_MIGRATE: + promote_start_time = time.time() + collection.promote_to_curated() + self.stdout.write(f"Promoted to CuratedUrl in {time.time() - promote_start_time:.2f} seconds") + + self.stdout.write( + f"Total processing time for 
collection: {time.time() - collection_start_time:.2f} seconds\n" + f"--------------------" + ) + + # Log the total time for the process + self.stdout.write(f"Total migration process completed in {time.time() - overall_start_time:.2f} seconds.") + + def migrate_patterns_for_collection(self, non_delta_model, collection): + """Migrate patterns from a non-delta model to the corresponding delta model for a specific collection.""" + # Determine the delta model name and fetch the model class + delta_model_name = "Delta" + non_delta_model.__name__ + delta_model = apps.get_model(non_delta_model._meta.app_label, delta_model_name) + + # Get all field names from both models except 'id' (primary key) + non_delta_fields = {field.name for field in non_delta_model._meta.fields if field.name != "id"} + delta_fields = {field.name for field in delta_model._meta.fields if field.name != "id"} + + # Find shared fields + shared_fields = non_delta_fields.intersection(delta_fields) + + # Only process patterns for the current collection + for pattern in non_delta_model.objects.filter(collection=collection): + # Build the dictionary of shared fields to copy + delta_fields_data = {field: getattr(pattern, field) for field in shared_fields} + + # Create an instance of the delta model and save it to call the custom save() method + delta_instance = delta_model(**delta_fields_data) + delta_instance.save() # Explicitly call save() to trigger custom logic diff --git a/sde_collections/migrations/0059_candidateurl_scraped_text.py b/sde_collections/migrations/0059_candidateurl_scraped_text.py new file mode 100644 index 00000000..cc3ea65b --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_scraped_text.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.9 on 2024-10-21 23:10 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="scraped_text", + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..16cf4219 --- /dev/null +++ b/sde_collections/migrations/0059_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,128 @@ +# Generated by Django 4.2.9 on 2024-11-20 06:39 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational 
Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - 
Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AlterModelTable( + name="candidateurl", + table="sde_collections_candidateurl", + ), + ] diff --git a/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py new file mode 100644 index 00000000..58478546 --- /dev/null +++ b/sde_collections/migrations/0059_url_curatedurl_deltaurl_dumpurl.py @@ -0,0 +1,146 @@ +# Generated by Django 4.2.9 on 2024-11-04 22:22 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="Url", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(max_length=4096, verbose_name="URL")), + ( + "scraped_title", + models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + max_length=1024, + verbose_name="Scraped Title", + ), + ), + ( + "generated_title", + models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + max_length=1024, + verbose_name="Generated Title", + ), + ), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "verbose_name": "URL", + "verbose_name_plural": "URLs", + "ordering": ["url"], + }, + ), + migrations.CreateModel( + name="CuratedUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Curated URL", + "verbose_name_plural": "Curated URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DeltaUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ("delete", models.BooleanField(default=False)), + ], + options={ + "verbose_name": "Delta URL", + "verbose_name_plural": "Delta URLs", + }, + bases=("sde_collections.url",), + ), + migrations.CreateModel( + name="DumpUrl", + fields=[ + ( + "url_ptr", + models.OneToOneField( + 
auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="sde_collections.url", + ), + ), + ], + options={ + "verbose_name": "Dump URL", + "verbose_name_plural": "Dump URLs", + }, + bases=("sde_collections.url",), + ), + ] diff --git a/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py b/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py new file mode 100644 index 00000000..12a0fb3c --- /dev/null +++ b/sde_collections/migrations/0060_alter_candidateurl_scraped_text.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.9 on 2024-11-07 17:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_scraped_text"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="scraped_text", + field=models.TextField( + blank=True, + default="", + help_text="This is the text scraped by Sinequa", + null=True, + verbose_name="Scraped Text", + ), + ), + ] diff --git a/sde_collections/migrations/0060_remove_deltaurl_url_ptr_remove_dumpurl_url_ptr_and_more.py b/sde_collections/migrations/0060_remove_deltaurl_url_ptr_remove_dumpurl_url_ptr_and_more.py new file mode 100644 index 00000000..1886e221 --- /dev/null +++ b/sde_collections/migrations/0060_remove_deltaurl_url_ptr_remove_dumpurl_url_ptr_and_more.py @@ -0,0 +1,37 @@ +# Generated by Django 4.2.9 on 2024-11-07 17:40 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_url_curatedurl_deltaurl_dumpurl"), + ] + + operations = [ + migrations.RemoveField( + model_name="deltaurl", + name="url_ptr", + ), + migrations.RemoveField( + model_name="dumpurl", + name="url_ptr", + ), + migrations.RemoveField( + model_name="url", + name="collection", + ), + migrations.DeleteModel( + name="CuratedUrl", + ), + migrations.DeleteModel( + name="DeltaUrl", + ), + migrations.DeleteModel( + name="DumpUrl", + ), + migrations.DeleteModel( + name="Url", + ), + ] diff --git a/sde_collections/migrations/0061_dumpurl_deltaurl_curatedurl.py b/sde_collections/migrations/0061_dumpurl_deltaurl_curatedurl.py new file mode 100644 index 00000000..abf4492c --- /dev/null +++ b/sde_collections/migrations/0061_dumpurl_deltaurl_curatedurl.py @@ -0,0 +1,162 @@ +# Generated by Django 4.2.9 on 2024-11-07 17:44 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0060_remove_deltaurl_url_ptr_remove_dumpurl_url_ptr_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="DumpUrl", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(unique=True, verbose_name="Url")), + ("scraped_title", models.CharField(blank=True, default="", verbose_name="Scraped Title")), + ("generated_title", models.CharField(blank=True, default="", verbose_name="Generated Title")), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + 
(6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s_urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "ordering": ["url"], + "abstract": False, + }, + ), + migrations.CreateModel( + name="DeltaUrl", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(unique=True, verbose_name="Url")), + ("scraped_title", models.CharField(blank=True, default="", verbose_name="Scraped Title")), + ("generated_title", models.CharField(blank=True, default="", verbose_name="Generated Title")), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ("delete", models.BooleanField(default=False)), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s_urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "ordering": ["url"], + "abstract": False, + }, + ), + migrations.CreateModel( + name="CuratedUrl", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("url", models.CharField(unique=True, verbose_name="Url")), + ("scraped_title", models.CharField(blank=True, default="", verbose_name="Scraped Title")), + ("generated_title", models.CharField(blank=True, default="", verbose_name="Generated Title")), + ("visited", models.BooleanField(default=False)), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ], + null=True, + ), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ], + null=True, + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s_urls", + to="sde_collections.collection", + ), + ), + ], + options={ + "ordering": ["url"], + "abstract": False, + }, + ), + ] diff --git a/sde_collections/migrations/0062_deltatitlepattern_deltaresolvedtitleerror_and_more.py b/sde_collections/migrations/0062_deltatitlepattern_deltaresolvedtitleerror_and_more.py new file mode 100644 index 00000000..48996f5b --- /dev/null +++ b/sde_collections/migrations/0062_deltatitlepattern_deltaresolvedtitleerror_and_more.py @@ -0,0 +1,288 @@ +# Generated by Django 4.2.9 on 2024-11-11 17:17 + +from django.db import migrations, models +import django.db.models.deletion +import sde_collections.models.delta_patterns + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0061_dumpurl_deltaurl_curatedurl"), + ] + + operations = [ + migrations.CreateModel( + name="DeltaTitlePattern", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "match_pattern", + models.CharField( + help_text="This pattern is 
compared against the URL of all the documents in the collection and matching documents will be returned", + verbose_name="Pattern", + ), + ), + ( + "match_pattern_type", + models.IntegerField(choices=[(1, "Individual URL Pattern"), (2, "Multi-URL Pattern")], default=1), + ), + ( + "title_pattern", + models.CharField( + help_text="This is the pattern for the new title. You can either write an exact replacement string (no quotes required) or you can write sinequa-valid code", + validators=[sde_collections.models.delta_patterns.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + ( + "curated_urls", + models.ManyToManyField(related_name="%(class)s_curated_urls", to="sde_collections.curatedurl"), + ), + ( + "delta_urls", + models.ManyToManyField(related_name="%(class)s_delta_urls", to="sde_collections.deltaurl"), + ), + ], + options={ + "verbose_name": "Title Pattern", + "verbose_name_plural": "Title Patterns", + "unique_together": {("collection", "match_pattern")}, + }, + ), + migrations.CreateModel( + name="DeltaResolvedTitleError", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("error_string", models.TextField()), + ("http_status_code", models.IntegerField(blank=True, null=True)), + ( + "delta_url", + models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to="sde_collections.deltaurl"), + ), + ( + "title_pattern", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="sde_collections.deltatitlepattern" + ), + ), + ], + options={ + "abstract": False, + }, + ), + migrations.CreateModel( + name="DeltaResolvedTitle", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("resolved_title", models.CharField(blank=True, default="")), + ( + "delta_url", + models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to="sde_collections.deltaurl"), + ), + ( + "title_pattern", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="sde_collections.deltatitlepattern" + ), + ), + ], + options={ + "verbose_name": "Resolved Title", + "verbose_name_plural": "Resolved Titles", + }, + ), + migrations.CreateModel( + name="DeltaIncludePattern", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "match_pattern", + models.CharField( + help_text="This pattern is compared against the URL of all the documents in the collection and matching documents will be returned", + verbose_name="Pattern", + ), + ), + ( + "match_pattern_type", + models.IntegerField(choices=[(1, "Individual URL Pattern"), (2, "Multi-URL Pattern")], default=1), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + ( + "curated_urls", + models.ManyToManyField(related_name="%(class)s_curated_urls", to="sde_collections.curatedurl"), + ), + ( + "delta_urls", + models.ManyToManyField(related_name="%(class)s_delta_urls", to="sde_collections.deltaurl"), + ), + ], + options={ + "verbose_name": "Include Pattern", + "verbose_name_plural": 
"Include Patterns", + "unique_together": {("collection", "match_pattern")}, + }, + ), + migrations.CreateModel( + name="DeltaExcludePattern", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "match_pattern", + models.CharField( + help_text="This pattern is compared against the URL of all the documents in the collection and matching documents will be returned", + verbose_name="Pattern", + ), + ), + ( + "match_pattern_type", + models.IntegerField(choices=[(1, "Individual URL Pattern"), (2, "Multi-URL Pattern")], default=1), + ), + ("reason", models.TextField(blank=True, default="", verbose_name="Reason for excluding")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + ( + "curated_urls", + models.ManyToManyField(related_name="%(class)s_curated_urls", to="sde_collections.curatedurl"), + ), + ( + "delta_urls", + models.ManyToManyField(related_name="%(class)s_delta_urls", to="sde_collections.deltaurl"), + ), + ], + options={ + "verbose_name": "Exclude Pattern", + "verbose_name_plural": "Exclude Patterns", + "unique_together": {("collection", "match_pattern")}, + }, + ), + migrations.CreateModel( + name="DeltaDocumentTypePattern", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "match_pattern", + models.CharField( + help_text="This pattern is compared against the URL of all the documents in the collection and matching documents will be returned", + verbose_name="Pattern", + ), + ), + ( + "match_pattern_type", + models.IntegerField(choices=[(1, "Individual URL Pattern"), (2, "Multi-URL Pattern")], default=1), + ), + ( + "document_type", + models.IntegerField( + choices=[ + (1, "Images"), + (2, "Data"), + (3, "Documentation"), + (4, "Software and Tools"), + (5, "Missions and Instruments"), + ] + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + ( + "curated_urls", + models.ManyToManyField(related_name="%(class)s_curated_urls", to="sde_collections.curatedurl"), + ), + ( + "delta_urls", + models.ManyToManyField(related_name="%(class)s_delta_urls", to="sde_collections.deltaurl"), + ), + ], + options={ + "verbose_name": "Document Type Pattern", + "verbose_name_plural": "Document Type Patterns", + "unique_together": {("collection", "match_pattern")}, + }, + ), + migrations.CreateModel( + name="DeltaDivisionPattern", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "match_pattern", + models.CharField( + help_text="This pattern is compared against the URL of all the documents in the collection and matching documents will be returned", + verbose_name="Pattern", + ), + ), + ( + "match_pattern_type", + models.IntegerField(choices=[(1, "Individual URL Pattern"), (2, "Multi-URL Pattern")], default=1), + ), + ( + "division", + models.IntegerField( + choices=[ + (1, "Astrophysics"), + (2, "Biological and Physical Sciences"), + (3, "Earth Science"), + (4, "Heliophysics"), + (5, "Planetary Science"), + (6, "General"), + ] + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)s", + related_query_name="%(class)ss", + 
to="sde_collections.collection", + ), + ), + ( + "curated_urls", + models.ManyToManyField(related_name="%(class)s_curated_urls", to="sde_collections.curatedurl"), + ), + ( + "delta_urls", + models.ManyToManyField(related_name="%(class)s_delta_urls", to="sde_collections.deltaurl"), + ), + ], + options={ + "verbose_name": "Division Pattern", + "verbose_name_plural": "Division Patterns", + "unique_together": {("collection", "match_pattern")}, + }, + ), + ] diff --git a/sde_collections/migrations/0063_merge_20241112_1428.py b/sde_collections/migrations/0063_merge_20241112_1428.py new file mode 100644 index 00000000..d104ce16 --- /dev/null +++ b/sde_collections/migrations/0063_merge_20241112_1428.py @@ -0,0 +1,13 @@ +# Generated by Django 4.2.9 on 2024-11-12 20:28 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0060_alter_candidateurl_scraped_text"), + ("sde_collections", "0062_deltatitlepattern_deltaresolvedtitleerror_and_more"), + ] + + operations = [] diff --git a/sde_collections/migrations/0064_alter_curatedurl_options_and_more.py b/sde_collections/migrations/0064_alter_curatedurl_options_and_more.py new file mode 100644 index 00000000..2a69c5d2 --- /dev/null +++ b/sde_collections/migrations/0064_alter_curatedurl_options_and_more.py @@ -0,0 +1,129 @@ +# Generated by Django 4.2.9 on 2024-11-12 20:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0063_merge_20241112_1428"), + ] + + operations = [ + migrations.AlterModelOptions( + name="curatedurl", + options={"ordering": ["url"], "verbose_name": "Curated Urls", "verbose_name_plural": "Curated Urls"}, + ), + migrations.AlterModelOptions( + name="deltadivisionpattern", + options={"verbose_name": "Delta Division Pattern", "verbose_name_plural": "Delta Division Patterns"}, + ), + migrations.AlterModelOptions( + name="deltadocumenttypepattern", + options={ + "verbose_name": "Delta Document Type Pattern", + "verbose_name_plural": "Delta Document Type Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltaexcludepattern", + options={"verbose_name": "Delta Exclude Pattern", "verbose_name_plural": "Delta Exclude Patterns"}, + ), + migrations.AlterModelOptions( + name="deltaincludepattern", + options={"verbose_name": "Delta Include Pattern", "verbose_name_plural": "Delta Include Patterns"}, + ), + migrations.AlterModelOptions( + name="deltatitlepattern", + options={"verbose_name": "Delta Title Pattern", "verbose_name_plural": "Delta Title Patterns"}, + ), + migrations.AlterModelOptions( + name="deltaurl", + options={"ordering": ["url"], "verbose_name": "Delta Urls", "verbose_name_plural": "Delta Urls"}, + ), + migrations.AlterModelOptions( + name="dumpurl", + options={"ordering": ["url"], "verbose_name": "Dump Urls", "verbose_name_plural": "Dump Urls"}, + ), + migrations.AddField( + model_name="curatedurl", + name="scraped_text", + field=models.TextField( + blank=True, default="", help_text="This is the text scraped by Sinequa", verbose_name="Scraped Text" + ), + ), + migrations.AddField( + model_name="deltaurl", + name="scraped_text", + field=models.TextField( + blank=True, default="", help_text="This is the text scraped by Sinequa", verbose_name="Scraped Text" + ), + ), + migrations.AddField( + model_name="dumpurl", + name="scraped_text", + field=models.TextField( + blank=True, default="", help_text="This is the text scraped by Sinequa", verbose_name="Scraped Text" + ), + ), + 
migrations.AlterField( + model_name="curatedurl", + name="generated_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + verbose_name="Generated Title", + ), + ), + migrations.AlterField( + model_name="curatedurl", + name="scraped_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + verbose_name="Scraped Title", + ), + ), + migrations.AlterField( + model_name="deltaurl", + name="generated_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + verbose_name="Generated Title", + ), + ), + migrations.AlterField( + model_name="deltaurl", + name="scraped_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + verbose_name="Scraped Title", + ), + ), + migrations.AlterField( + model_name="dumpurl", + name="generated_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the title generated based on a Title Pattern", + verbose_name="Generated Title", + ), + ), + migrations.AlterField( + model_name="dumpurl", + name="scraped_title", + field=models.CharField( + blank=True, + default="", + help_text="This is the original title scraped by Sinequa", + verbose_name="Scraped Title", + ), + ), + ] diff --git a/sde_collections/migrations/0065_rename_delete_deltaurl_to_delete_and_more.py b/sde_collections/migrations/0065_rename_delete_deltaurl_to_delete_and_more.py new file mode 100644 index 00000000..a7507629 --- /dev/null +++ b/sde_collections/migrations/0065_rename_delete_deltaurl_to_delete_and_more.py @@ -0,0 +1,42 @@ +# Generated by Django 4.2.9 on 2024-11-16 00:26 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0064_alter_curatedurl_options_and_more"), + ] + + operations = [ + migrations.RenameField( + model_name="deltaurl", + old_name="delete", + new_name="to_delete", + ), + migrations.AlterField( + model_name="curatedurl", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="curated_urls", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltaurl", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, related_name="delta_urls", to="sde_collections.collection" + ), + ), + migrations.AlterField( + model_name="dumpurl", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, related_name="dump_urls", to="sde_collections.collection" + ), + ), + ] diff --git a/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py b/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py new file mode 100644 index 00000000..f9be360b --- /dev/null +++ b/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py @@ -0,0 +1,83 @@ +# Generated by Django 4.2.9 on 2024-11-23 17:44 + +from django.db import migrations, models +import sde_collections.models.delta_patterns + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"), + ] + + operations = [ + migrations.AlterUniqueTogether( + name="deltadivisionpattern", + unique_together=set(), + ), + 
migrations.AlterUniqueTogether( + name="deltadocumenttypepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltaexcludepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltaincludepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltatitlepattern", + unique_together=set(), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="title_pattern", + field=models.CharField( + help_text="Pattern for the new title. Support exact replacement or sinequa-valid code", + validators=[sde_collections.models.delta_patterns.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ] diff --git a/sde_collections/migrations/0066_merge_20241120_0158.py b/sde_collections/migrations/0066_merge_20241120_0158.py new file mode 100644 index 00000000..ccd58b61 --- /dev/null +++ b/sde_collections/migrations/0066_merge_20241120_0158.py @@ -0,0 +1,13 @@ +# Generated by Django 4.2.9 on 2024-11-20 07:58 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0059_candidateurl_tdamm_tag_manual_and_more"), + ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"), + ] + + operations = [] diff --git a/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py b/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py new file mode 100644 index 00000000..4a244362 --- /dev/null +++ b/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py @@ -0,0 +1,73 @@ +# Generated by Django 4.2.9 on 2024-11-23 18:14 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0066_alter_deltadivisionpattern_unique_together_and_more"), + ] + + operations = [ + migrations.AlterModelOptions( + name="deltadivisionpattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Division Pattern", + "verbose_name_plural": "Delta Division Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltadocumenttypepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Document Type Pattern", + "verbose_name_plural": "Delta Document Type Patterns", + }, + ), + migrations.AlterModelOptions( + 
name="deltaexcludepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Exclude Pattern", + "verbose_name_plural": "Delta Exclude Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltaincludepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Include Pattern", + "verbose_name_plural": "Delta Include Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltatitlepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Title Pattern", + "verbose_name_plural": "Delta Title Patterns", + }, + ), + migrations.AlterUniqueTogether( + name="deltadivisionpattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltadocumenttypepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltaexcludepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltaincludepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltatitlepattern", + unique_together={("collection", "match_pattern")}, + ), + ] diff --git a/sde_collections/migrations/0067_remove_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0067_remove_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..2391e18c --- /dev/null +++ b/sde_collections/migrations/0067_remove_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.9 on 2024-11-20 16:12 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0066_merge_20241120_0158"), + ] + + operations = [ + migrations.RemoveField( + model_name="candidateurl", + name="tdamm_tag_manual", + ), + migrations.RemoveField( + model_name="candidateurl", + name="tdamm_tag_ml", + ), + ] diff --git a/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py b/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py new file mode 100644 index 00000000..91d87951 --- /dev/null +++ b/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py @@ -0,0 +1,124 @@ +# Generated by Django 4.2.9 on 2024-11-24 19:39 + +from django.db import migrations, models +import django.db.models.deletion +import sde_collections.models.delta_patterns + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0067_alter_deltadivisionpattern_options_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="deltadivisionpattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + 
model_name="deltadocumenttypepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="title_pattern", + field=models.CharField( + help_text="Pattern for the new title. 
Can be an exact replacement string or sinequa-valid code", + validators=[sde_collections.models.delta_patterns.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ] diff --git a/sde_collections/migrations/0068_curatedurl_tdamm_tag_manual_curatedurl_tdamm_tag_ml_and_more.py b/sde_collections/migrations/0068_curatedurl_tdamm_tag_manual_curatedurl_tdamm_tag_ml_and_more.py new file mode 100644 index 00000000..adae0e2b --- /dev/null +++ b/sde_collections/migrations/0068_curatedurl_tdamm_tag_manual_curatedurl_tdamm_tag_ml_and_more.py @@ -0,0 +1,344 @@ +# Generated by Django 4.2.9 on 2024-11-20 16:23 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0067_remove_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="curatedurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="curatedurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - 
Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="deltaurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", 
"Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="deltaurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="dumpurl", + name="tdamm_tag_manual", + 
field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="dumpurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - 
Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + ] diff --git a/sde_collections/migrations/0069_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0069_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..d45e8108 --- /dev/null +++ b/sde_collections/migrations/0069_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,124 @@ +# Generated by Django 4.2.9 on 2024-11-20 23:42 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0068_curatedurl_tdamm_tag_manual_curatedurl_tdamm_tag_ml_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - 
Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AddField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + ] diff --git a/sde_collections/migrations/0070_merge_20241205_1437.py b/sde_collections/migrations/0070_merge_20241205_1437.py new file mode 100644 index 00000000..8d904006 --- /dev/null +++ b/sde_collections/migrations/0070_merge_20241205_1437.py @@ -0,0 +1,13 @@ +# Generated by Django 4.2.9 on 2024-12-05 20:37 + +from django.db import migrations + + +class Migration(migrations.Migration): + + 
dependencies = [ + ("sde_collections", "0068_alter_deltadivisionpattern_collection_and_more"), + ("sde_collections", "0069_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [] diff --git a/sde_collections/migrations/0071_alter_candidateurl_tdamm_tag_manual_and_more.py b/sde_collections/migrations/0071_alter_candidateurl_tdamm_tag_manual_and_more.py new file mode 100644 index 00000000..12b7ae8e --- /dev/null +++ b/sde_collections/migrations/0071_alter_candidateurl_tdamm_tag_manual_and_more.py @@ -0,0 +1,466 @@ +# Generated by Django 4.2.9 on 2024-12-05 23:36 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0070_merge_20241205_1437"), + ] + + operations = [ + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="candidateurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - 
Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="curatedurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - 
Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="curatedurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AlterField( 
+ model_name="deltaurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="deltaurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries 
- Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="dumpurl", + name="tdamm_tag_manual", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - 
Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_manual", + null=True, + size=None, + ), + ), + migrations.AlterField( + model_name="dumpurl", + name="tdamm_tag_ml", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + ("Not TDAMM", "Not TDAMM"), + ("MMA_M_EM", "Messenger - EM Radiation"), + ("MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays"), + ("MMA_M_EM_X", "Messenger - EM Radiation - X-rays"), + ("MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet"), + ("MMA_M_EM_O", "Messenger - EM Radiation - Optical"), + ("MMA_M_EM_I", "Messenger - EM Radiation - Infrared"), + ("MMA_M_EM_M", "Messenger - EM Radiation - Microwave"), + ("MMA_M_EM_R", "Messenger - EM Radiation - Radio"), + ("MMA_M_G", "Messenger - Gravitational Waves"), + ("MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral"), + ("MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic"), + ("MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous"), + ("MMA_M_G_B", "Messenger - Gravitational Waves - Burst"), + ("MMA_M_C", "Messenger - Cosmic Rays"), + ("MMA_M_N", "Messenger - Neutrinos"), + ("MMA_O_BI", "Objects - Binaries"), + ("MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes"), + ("MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars"), + ("MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables"), + ("MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole"), + ("MMA_O_BI_B", "Objects - Binaries - Binary Pulsars"), + ("MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries"), + ("MMA_O_BH", "Objects - Black Holes"), + ("MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei"), + ("MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass"), + ("MMA_O_BH_STM", "Objects - Black Holes - Stellar mass"), + ("MMA_O_BH_SUM", "Objects - Black Holes - Supermassive"), + ("MMA_O_E", "Objects - Exoplanets"), + ("MMA_O_N", "Objects - Neutron Stars"), + ("MMA_O_N_M", "Objects - Neutron Stars - Magnetars"), + ("MMA_O_N_P", "Objects - Neutron Stars - Pulsars"), + ("MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula"), + ("MMA_O_S", "Objects - Supernova Remnants"), + ("MMA_S_F", "Signals - Fast Radio Bursts"), + ("MMA_S_G", "Signals - Gamma-ray Bursts"), + ("MMA_S_K", "Signals - Kilonovae"), + ("MMA_S_N", "Signals - Novae"), + ("MMA_S_P", "Signals - Pevatrons"), + ("MMA_S_ST", "Signals - Stellar flares"), + ("MMA_S_SU", "Signals - Supernovae"), + ], + max_length=255, + ), + blank=True, + db_column="tdamm_tag_ml", + null=True, + size=None, + ), + ), + migrations.AlterModelTable( + name="candidateurl", + table=None, + ), + ] diff --git a/sde_collections/migrations/0072_collection_reindexing_status_reindexinghistory.py b/sde_collections/migrations/0072_collection_reindexing_status_reindexinghistory.py new file mode 100644 index 00000000..8a746cb4 --- /dev/null +++ b/sde_collections/migrations/0072_collection_reindexing_status_reindexinghistory.py @@ -0,0 +1,133 @@ +# Generated by Django 4.2.9 on 2024-12-06 03:51 +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +def set_initial_reindexing_status(apps, schema_editor): + Collection = apps.get_model("sde_collections", "Collection") + + # Define the workflow status values + RESEARCH_IN_PROGRESS = 1 + READY_FOR_ENGINEERING = 2 + ENGINEERING_IN_PROGRESS = 3 + READY_FOR_CURATION = 4 + 
CURATION_IN_PROGRESS = 5 + CURATED = 6 + QUALITY_FIXED = 7 + SECRET_DEPLOYMENT_STARTED = 8 + SECRET_DEPLOYMENT_FAILED = 9 + READY_FOR_LRM_QUALITY_CHECK = 10 + READY_FOR_FINAL_QUALITY_CHECK = 11 + QUALITY_CHECK_FAILED = 12 + QUALITY_CHECK_PERFECT = 13 + MERGE_PENDING = 14 + NEEDS_DELETE = 19 + + # Workflow statuses that should be marked as reindexing not needed + reindexing_not_needed_statuses = [ + RESEARCH_IN_PROGRESS, + READY_FOR_ENGINEERING, + ENGINEERING_IN_PROGRESS, + READY_FOR_CURATION, + CURATION_IN_PROGRESS, + CURATED, + QUALITY_FIXED, + SECRET_DEPLOYMENT_STARTED, + SECRET_DEPLOYMENT_FAILED, + READY_FOR_LRM_QUALITY_CHECK, + READY_FOR_FINAL_QUALITY_CHECK, + QUALITY_CHECK_FAILED, + QUALITY_CHECK_PERFECT, + MERGE_PENDING, + NEEDS_DELETE, + ] + + # Set collections that don't need reindexing + Collection.objects.filter(workflow_status__in=reindexing_not_needed_statuses).update( + reindexing_status=1 + ) # NOT_NEEDED + + # All other collections need reindexing + Collection.objects.exclude(workflow_status__in=reindexing_not_needed_statuses).update(reindexing_status=2) # NEEDED + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ("sde_collections", "0071_alter_candidateurl_tdamm_tag_manual_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="collection", + name="reindexing_status", + field=models.IntegerField( + choices=[ + (1, "Reindexing Not Needed"), + (2, "Reindexing Needed on LRM Dev"), + (3, "Reindexing Finished on LRM Dev"), + (4, "Ready for Curation"), + (5, "Curated"), + (6, "Indexed on Prod"), + ], + default=1, + verbose_name="Reindexing Status", + ), + ), + migrations.CreateModel( + name="ReindexingHistory", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "reindexing_status", + models.IntegerField( + choices=[ + (1, "Reindexing Not Needed"), + (2, "Reindexing Needed on LRM Dev"), + (3, "Reindexing Finished on LRM Dev"), + (4, "Ready for Curation"), + (5, "Curated"), + (6, "Indexed on Prod"), + ], + default=1, + ), + ), + ( + "old_status", + models.IntegerField( + choices=[ + (1, "Reindexing Not Needed"), + (2, "Reindexing Needed on LRM Dev"), + (3, "Reindexing Finished on LRM Dev"), + (4, "Ready for Curation"), + (5, "Curated"), + (6, "Indexed on Prod"), + ], + null=True, + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "collection", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="reindexing_history", + to="sde_collections.collection", + ), + ), + ( + "curated_by", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.DO_NOTHING, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + ), + migrations.RunPython(set_initial_reindexing_status), + ] diff --git a/sde_collections/migrations/0073_alter_collection_workflow_status_and_more.py b/sde_collections/migrations/0073_alter_collection_workflow_status_and_more.py new file mode 100644 index 00000000..df71a2b6 --- /dev/null +++ b/sde_collections/migrations/0073_alter_collection_workflow_status_and_more.py @@ -0,0 +1,100 @@ +# Generated by Django 4.2.9 on 2024-12-10 19:18 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0072_collection_reindexing_status_reindexinghistory"), + ] + + operations = [ + migrations.AlterField( + model_name="collection", + name="workflow_status", + 
field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "QC: Failed"), + (18, "QC: Minor Issues"), + (13, "QC: Perfect"), + (14, "Prod: Perfect"), + (15, "Prod: Minor Issues"), + (16, "Prod: Major Issues"), + (17, "Code Merge Pending"), + (19, "Delete from Prod"), + (20, "Indexing Finished on LRM Dev"), + ], + default=1, + ), + ), + migrations.AlterField( + model_name="workflowhistory", + name="old_status", + field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "QC: Failed"), + (18, "QC: Minor Issues"), + (13, "QC: Perfect"), + (14, "Prod: Perfect"), + (15, "Prod: Minor Issues"), + (16, "Prod: Major Issues"), + (17, "Code Merge Pending"), + (19, "Delete from Prod"), + (20, "Indexing Finished on LRM Dev"), + ], + null=True, + ), + ), + migrations.AlterField( + model_name="workflowhistory", + name="workflow_status", + field=models.IntegerField( + choices=[ + (1, "Research in Progress"), + (2, "Ready for Engineering"), + (3, "Engineering in Progress"), + (4, "Ready for Curation"), + (5, "Curation in Progress"), + (6, "Curated"), + (7, "Quality Fixed"), + (8, "Secret Deployment Started"), + (9, "Secret Deployment Failed"), + (10, "Ready for LRM Quality Check"), + (11, "Ready for Quality Check"), + (12, "QC: Failed"), + (18, "QC: Minor Issues"), + (13, "QC: Perfect"), + (14, "Prod: Perfect"), + (15, "Prod: Minor Issues"), + (16, "Prod: Major Issues"), + (17, "Code Merge Pending"), + (19, "Delete from Prod"), + (20, "Indexing Finished on LRM Dev"), + ], + default=1, + ), + ), + ] diff --git a/sde_collections/migrations/0074_alter_collection_reindexing_status_and_more.py b/sde_collections/migrations/0074_alter_collection_reindexing_status_and_more.py new file mode 100644 index 00000000..06963e83 --- /dev/null +++ b/sde_collections/migrations/0074_alter_collection_reindexing_status_and_more.py @@ -0,0 +1,59 @@ +# Generated by Django 4.2.9 on 2024-12-11 02:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0073_alter_collection_workflow_status_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="collection", + name="reindexing_status", + field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation Finished"), + (6, "Re-Indexed on Prod"), + ], + default=1, + verbose_name="Reindexing Status", + ), + ), + migrations.AlterField( + model_name="reindexinghistory", + name="old_status", + field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation Finished"), + (6, "Re-Indexed on Prod"), + ], + null=True, + ), + ), + migrations.AlterField( + model_name="reindexinghistory", + name="reindexing_status", + 
field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation Finished"), + (6, "Re-Indexed on Prod"), + ], + default=1, + ), + ), + ] diff --git a/sde_collections/migrations/0075_alter_collection_reindexing_status_and_more.py b/sde_collections/migrations/0075_alter_collection_reindexing_status_and_more.py new file mode 100644 index 00000000..5ecddbc9 --- /dev/null +++ b/sde_collections/migrations/0075_alter_collection_reindexing_status_and_more.py @@ -0,0 +1,99 @@ +# Generated by Django 4.2.9 on 2024-12-13 19:57 + +from django.db import migrations, models + + +def migrate_reindexing_statuses(apps, schema_editor): + Collection = apps.get_model("sde_collections", "Collection") + ReindexingHistory = apps.get_model("sde_collections", "ReindexingHistory") + + # Update Collections + Collection.objects.filter(reindexing_status=6).update(reindexing_status=7) # Move "Indexed on Prod" first + Collection.objects.filter(reindexing_status=5).update(reindexing_status=6) # Then move "Curated" + # 5 is now free for "Curation in Progress" + + # Update ReindexingHistory + ReindexingHistory.objects.filter(reindexing_status=6).update(reindexing_status=7) + ReindexingHistory.objects.filter(reindexing_status=5).update(reindexing_status=6) + + ReindexingHistory.objects.filter(old_status=6).update(old_status=7) + ReindexingHistory.objects.filter(old_status=5).update(old_status=6) + + +def reverse_migrate_reindexing_statuses(apps, schema_editor): + Collection = apps.get_model("sde_collections", "Collection") + ReindexingHistory = apps.get_model("sde_collections", "ReindexingHistory") + + # Reverse Collections + Collection.objects.filter(reindexing_status=5).update(reindexing_status=None) # Clear new status + Collection.objects.filter(reindexing_status=6).update(reindexing_status=5) + Collection.objects.filter(reindexing_status=7).update(reindexing_status=6) + + # Reverse ReindexingHistory + ReindexingHistory.objects.filter(reindexing_status=5).update(reindexing_status=None) + ReindexingHistory.objects.filter(reindexing_status=6).update(reindexing_status=5) + ReindexingHistory.objects.filter(reindexing_status=7).update(reindexing_status=6) + + ReindexingHistory.objects.filter(old_status=5).update(old_status=None) + ReindexingHistory.objects.filter(old_status=6).update(old_status=5) + ReindexingHistory.objects.filter(old_status=7).update(old_status=6) + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0074_alter_collection_reindexing_status_and_more"), + ] + + operations = [ + migrations.RunPython(migrate_reindexing_statuses, reverse_migrate_reindexing_statuses), + migrations.AlterField( + model_name="collection", + name="reindexing_status", + field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation in Progress"), + (6, "Re-Curation Finished"), + (7, "Re-Indexed on Prod"), + ], + default=1, + verbose_name="Reindexing Status", + ), + ), + migrations.AlterField( + model_name="reindexinghistory", + name="old_status", + field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation in Progress"), + (6, "Re-Curation Finished"), + (7, "Re-Indexed on Prod"), + ], + null=True, + ), + ), + migrations.AlterField( + 
model_name="reindexinghistory", + name="reindexing_status", + field=models.IntegerField( + choices=[ + (1, "Re-Indexing Not Needed"), + (2, "Re-Indexing Needed"), + (3, "Re-Indexing Finished"), + (4, "Ready for Re-Curation"), + (5, "Re-Curation in Progress"), + (6, "Re-Curation Finished"), + (7, "Re-Indexed on Prod"), + ], + default=1, + ), + ), + ] diff --git a/sde_collections/models/README.md b/sde_collections/models/README.md new file mode 100644 index 00000000..3c326f6a --- /dev/null +++ b/sde_collections/models/README.md @@ -0,0 +1,15 @@ +# COSMOS Curation System + +A system for managing collections of URLs through pattern-based rules and status workflows. + +## Documentation + + +- [URL Pattern Overview](./README_PATTERN_OVERVIEW.md) - Core pattern system for URL filtering and modification + - [Pattern System Details](./README_PATTERN_SYSTEM.md) + - [URL Lifecycle Management](./README_LIFECYCLE.md) + - [Pattern Resolution](./README_PATTERN_RESOLUTION.md) + - [URL Inclusion/Exclusion](./README_INCLUSION.md) + - [Pattern Unapplication Logic](./README_UNAPPLY_LOGIC.md) +- [Collection Status Workflows](./README_STATUS_TRIGGERS.md) - Collection progression and automated triggers +- [Reindexing Status System](./README_REINDEXING_STATUSES.md) - Status management for reindexing collections diff --git a/sde_collections/models/README_INCLUSION.md b/sde_collections/models/README_INCLUSION.md new file mode 100644 index 00000000..d2fedf51 --- /dev/null +++ b/sde_collections/models/README_INCLUSION.md @@ -0,0 +1,146 @@ +# URL Include and Exclude Patterns + +## Overview + +The pattern system allows you to control which URLs are included in or excluded from your collection using two types of patterns: +- **Exclude Patterns**: Mark URLs for exclusion from the collection +- **Include Patterns**: Explicitly include URLs, overriding any exclude patterns + +## Pattern Types + +### Individual URL Patterns +- Matches exact URLs +- Best for targeting specific pages +- No wildcards allowed +```python +# Matches only exactly this URL +match_pattern = "https://example.com/docs/specific-page.html" +``` + +### Multi-URL (Wildcard) Patterns +- Uses `*` as a wildcard to match multiple URLs +- Best for targeting entire directories or file types +- Can have wildcards anywhere in the pattern +```python +# Matches all files in the /docs directory +match_pattern = "https://example.com/docs/*" + +# Matches all PDF files +match_pattern = "https://example.com/*.pdf" +``` + +## Pattern Precedence + +1. Include patterns **always** take precedence over exclude patterns +2. More specific patterns take precedence over general patterns +3. 
If a URL matches both an include and exclude pattern, it will be included + +## Common Examples + +### Excluding a Directory But Including Specific Files + +```python +# Exclude the internal docs directory +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/*", + match_pattern_type=2 # Multi-URL pattern +) + +# But include specific approved pages +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/public-roadmap.html", + match_pattern_type=1 # Individual URL pattern +) +``` + +### Including Only Specific File Types + +```python +# Exclude everything in docs directory +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*", + match_pattern_type=2 +) + +# Include only PDF files +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*.pdf", + match_pattern_type=2 +) +``` + +### Folder-Based Access Control + +```python +# Exclude all draft documents +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/drafts/*", + match_pattern_type=2 +) + +# Include the approved drafts subfolder +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/drafts/approved/*", + match_pattern_type=2 +) +``` + +## Best Practices + +1. **Start Specific**: Begin with specific patterns and broaden as needed + ```python + # Better + match_pattern = "https://example.com/docs/api/v1/*" + # Less precise + match_pattern = "https://example.com/docs/*" + ``` + +2. **Use Include for Exceptions**: When excluding a large section, use include patterns for exceptions + ```python + # Exclude staging environment + exclude_pattern = "https://staging.example.com/*" + # Include specific staging features that should be public + include_pattern = "https://staging.example.com/features/released/*" + ``` + +3. **Document Patterns**: Keep track of why each pattern was added + ```python + DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/*", + reason="Internal documentation not ready for public release" + ) + ``` + +4. **Regular Maintenance**: Review patterns periodically to ensure they're still needed and correct + +## Common Gotchas + +1. **Trailing Slashes**: URLs with and without trailing slashes are treated as different + ```python + # These are different patterns + "https://example.com/docs" + "https://example.com/docs/" + ``` + +2. **Over-Inclusive Wildcards**: Be careful with patterns that might match too much + ```python + # Dangerous: Could match more than intended + match_pattern = "https://example.com/*internal*" + + # Better: More specific + match_pattern = "https://example.com/internal/*" + ``` + +3. **Pattern Order**: Remember that include patterns always win, regardless of the order they're created + ```python + # This URL will be included despite the exclude pattern + exclude_pattern = "https://example.com/docs/*" + include_pattern = "https://example.com/docs/public.html" + ``` diff --git a/sde_collections/models/README_LIFECYCLE.md b/sde_collections/models/README_LIFECYCLE.md new file mode 100644 index 00000000..cd6bcc33 --- /dev/null +++ b/sde_collections/models/README_LIFECYCLE.md @@ -0,0 +1,240 @@ +# URL Migration and Promotion Guide + +## Overview +This document explains the lifecycle of URLs in the system, focusing on two critical processes: +1. 
Migration from DumpUrls to DeltaUrls
+2. Promotion from DeltaUrls to CuratedUrls
+
+## Core Concepts
+
+### URL States
+- **DumpUrls**: Raw data from initial scraping/indexing
+- **DeltaUrls**: Work-in-progress changes and modifications
+- **CuratedUrls**: Production-ready, approved content
+
+### Fields That Transfer
+All fields transfer between states, including:
+- URL
+- Scraped Title
+- Generated Title
+- Document Type
+- Division
+- Excluded Status
+- Scraped Text
+- Any additional metadata
+
+## Pattern Application
+
+### When Patterns Are Applied
+Patterns are applied in two scenarios:
+1. During migration from Dump to Delta
+2. When a new pattern is created/updated
+
+Patterns are NOT applied during promotion. The effects of patterns (modified titles, document types, etc.) are carried through to CuratedUrls during promotion, but the patterns themselves don't reapply.
+
+### Pattern Effects
+- Patterns modify DeltaUrls when they are created or when DeltaUrls are created through migration
+- Pattern-modified fields (titles, document types, etc.) become part of the DeltaUrl's data
+- These modifications persist through promotion to CuratedUrls
+- Pattern relationships (which patterns affect which URLs) are maintained for tracking purposes
+
+## Migration Process (Dump → Delta)
+
+### Overview
+Migration converts DumpUrls to DeltaUrls, preserving all fields and applying patterns. This process happens when:
+- New content is scraped
+- Content is reindexed
+- Collection is being prepared for curation
+
+### Steps
+1. Clear existing DeltaUrls
+2. Process each DumpUrl:
+   - If matching CuratedUrl exists: Create Delta with all fields
+   - If no matching CuratedUrl: Create Delta as new URL
+3. Process missing CuratedUrls:
+   - Create deletion Deltas for any not in Dump
+4. Apply all patterns to new Deltas
+5. Clear DumpUrls
+
+### Examples
+
+#### Example 1: Basic Migration
+If there are no patterns or existing CuratedUrls, the DeltaUrl will be created from the DumpUrl.
+```python
+# Starting State
+dump_url = DumpUrl(
+    url="example.com/doc",
+    scraped_title="Original Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+
+# After Migration
+delta_url = DeltaUrl(
+    url="example.com/doc",
+    scraped_title="Original Title",
+    document_type=DocumentTypes.DOCUMENTATION,
+    to_delete=False
+)
+```
+
+#### Example 2: Migration with Existing Curated
+If a CuratedUrl exists for the URL, and the DumpUrl has changes, a DeltaUrl will be created.
+```python
+# Starting State
+dump_url = DumpUrl(
+    url="example.com/doc",
+    scraped_title="New Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+
+curated_url = CuratedUrl(
+    url="example.com/doc",
+    scraped_title="Old Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+
+# After Migration
+delta_url = DeltaUrl(
+    url="example.com/doc",
+    scraped_title="New Title",  # Different from curated
+    document_type=DocumentTypes.DOCUMENTATION,
+    to_delete=False
+)
+
+curated_url = CuratedUrl(
+    url="example.com/doc",
+    scraped_title="Old Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+```
+
+#### Example 3: Migration with Pattern Application
+If a pattern exists that modifies the document type of a DumpUrl, that pattern will be applied and the DeltaUrl will reflect the pattern's changes.
+```python
+# Starting State
+dump_url = DumpUrl(
+    url="example.com/data/file.pdf",
+    scraped_title="Data File",
+    document_type=None
+)
+document_type_pattern = DocumentTypePattern(
+    match_pattern="*.pdf",
+    document_type=DocumentTypes.DATA
+)
+
+# After Migration and Pattern Application
+delta_url = DeltaUrl(
+    url="example.com/data/file.pdf",
+    scraped_title="Data File",
+    document_type=DocumentTypes.DATA,  # Set by pattern
+    to_delete=False
+)
+```
+
+## Promotion Process (Delta → Curated)
+
+### Overview
+Promotion moves DeltaUrls to CuratedUrls, carrying forward all changes including pattern-applied modifications. This occurs when:
+- A curator marks a collection as Curated
+
+### Steps
+1. Process each DeltaUrl:
+   - If marked for deletion: Remove matching CuratedUrl
+   - Otherwise: Update/create CuratedUrl with ALL fields
+2. Clear all DeltaUrls
+3. Update pattern relationship tracking
+
+### Examples
+
+#### Example 1: Basic Promotion
+If there are no CuratedUrls for the URL, the DeltaUrl will be promoted to a new CuratedUrl.
+```python
+# Starting State
+delta_url = DeltaUrl(
+    url="example.com/doc",
+    scraped_title="New Title",
+    document_type=DocumentTypes.DOCUMENTATION,
+    to_delete=False
+)
+
+# After Promotion
+curated_url = CuratedUrl(
+    url="example.com/doc",
+    scraped_title="New Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+```
+
+#### Example 2: Promotion with NULL Override
+It's important to note that the None value in the DeltaUrl is preserved in the CuratedUrl.
+```python
+# Starting State
+delta_url = DeltaUrl(
+    url="example.com/doc",
+    scraped_title="Title",
+    document_type=None,  # Explicitly set to None by pattern
+    to_delete=False
+)
+
+curated_url = CuratedUrl(
+    url="example.com/doc",
+    scraped_title="Title",
+    document_type=DocumentTypes.DOCUMENTATION
+)
+
+# After Promotion
+curated_url = CuratedUrl(
+    url="example.com/doc",
+    scraped_title="Title",
+    document_type=None  # NULL value preserved
+)
+```
+
+#### Example 3: Deletion During Promotion
+If there is no DumpUrl for an existing CuratedUrl, this signifies the URL has been removed from the collection. A DeltaUrl with `to_delete=True` will be created, and on promotion the CuratedUrl will be deleted.
+```python
+# Starting State
+delta_url = DeltaUrl(
+    url="example.com/old-doc",
+    scraped_title="Old Title",
+    to_delete=True
+)
+
+curated_url = CuratedUrl(
+    url="example.com/old-doc",
+    scraped_title="Old Title"
+)
+
+# After Promotion
+# CuratedUrl is deleted
+# DeltaUrl is cleared
+```
+
+## Important Notes
+
+### Field Handling
+- ALL fields are copied during migration and promotion
+- NULL values in DeltaUrls are treated as explicit values
+- Pattern-set values take precedence over original values
+
+### Pattern Behavior
+- Patterns only apply during migration or when patterns themselves are created/updated
+- Pattern effects are preserved during promotion as regular field values
+- Patterns are NOT re-applied during promotion. This means you can't add a DeltaUrl outside of the migration process and expect patterns to apply. In this case, you would need to either add it as a DumpUrl and migrate it correctly, or add it as a DeltaUrl and manually apply the pattern.
diff --git a/sde_collections/models/README_MANUAL_TESTING.md b/sde_collections/models/README_MANUAL_TESTING.md
new file mode 100644
index 00000000..2be75f50
--- /dev/null
+++ b/sde_collections/models/README_MANUAL_TESTING.md
@@ -0,0 +1,166 @@
+# COSMOS Curation System Testing Guide
+
+## Resources
+There are 14 collections which have been reindexed on dev and can have their statuses changed to `REINDEXING_FINISHED` to test URL importing. The collections and their counts can be seen [here](https://docs.google.com/spreadsheets/d/1z_YeTwsyadW6ywPsahUElnf8X65gP7t7UyaO7sVqGiI/edit?gid=1316450061#gid=1316450061).
+
+## Test Flow 1: Basic URL Collection Lifecycle
+
+### Objective
+Verify the complete lifecycle of a URL collection from initial creation through curation to production.
+
+### Prerequisites
+- Access to dev environment
+- Test collection created
+- Sample URLs ready for testing
+
+### Test Cases
+
+#### 1.1 Collection Status Progression
+1. Create new collection in `RESEARCH_IN_PROGRESS` status
+2. Verify initial scraper and indexer configs are created when moved to `READY_FOR_ENGINEERING`
+3. Progress through `ENGINEERING_IN_PROGRESS` to `INDEXING_FINISHED_ON_DEV`
+4. Confirm full text fetch triggers automatically
+5. Verify status updates to `READY_FOR_CURATION`
+6. Check plugin config creation
+7. Move through `CURATION_IN_PROGRESS` to `CURATED`
+8. Verify DeltaUrls promotion to CuratedUrls
+9. Test quality check status changes (`QUALITY_CHECK_PERFECT/MINOR`)
+10. Confirm collection appears in public query after PR merge
+
+#### 1.2 Data State Transitions
+1. Verify DumpUrls are created during indexing
+2. Test migration from DumpUrls to DeltaUrls
+3. Confirm field preservation during transitions
+4. Check promotion from DeltaUrls to CuratedUrls
+5. Verify all metadata transfers correctly
+
+Expected Results:
+- Each status transition triggers appropriate automated actions
+- Data integrity maintained through all transitions
+- Correct config generation at each stage
+- Proper public visibility after final approval
+
+## Test Flow 2: Pattern System Functionality
+
+### Objective
+Test the creation, application, and interaction of different pattern types.
+
+### Prerequisites
+- Collection with sample URLs
+- Mix of different URL types and structures
+
+### Test Cases
+
+#### 2.1 Include/Exclude Patterns
+1. Create exclude pattern for specific directory
+   ```python
+   pattern = "https://example.com/internal/*"
+   ```
+2. 
Create include pattern for specific file within excluded directory + ```python + pattern = "https://example.com/internal/public-doc.html" + ``` +3. Verify include pattern overrides exclude pattern +4. Test wildcard pattern matching +5. Check pattern precedence rules + +#### 2.2 Modification Patterns +1. Create overlapping title patterns: + ```python + pattern1 = "*/docs/* → title='Documentation'" + pattern2 = "*/docs/api/* → title='API Reference'" + ``` +2. Create division patterns with different specificity +3. Test document type patterns with wildcards +4. Verify "smallest set priority" resolution +5. Check pattern application during migrations + +#### 2.3 Pattern Removal Scenarios +1. Test removing pattern affecting only Delta URLs +2. Remove pattern affecting Curated URLs +3. Verify handling of multiple pattern effects +4. Test manual change preservation +5. Check cleanup procedures + +Expected Results: +- Pattern precedence rules correctly applied +- Proper handling of overlapping patterns +- Manual changes preserved during pattern operations +- Correct reversal of pattern effects on removal + +## Test Flow 3: Reindexing Workflow + +### Objective +Verify the reindexing process and status management. + +### Prerequisites +- Existing collection in production +- Access to both dev and prod environments + +### Test Cases + +#### 3.1 Reindexing Status Progression +1. Change status from `REINDEXING_NOT_NEEDED` to `REINDEXING_NEEDED_ON_DEV` +2. Complete reindexing and update to `REINDEXING_FINISHED_ON_DEV` +3. Verify automatic full text fetch +4. Confirm status update to `REINDEXING_READY_FOR_CURATION` +5. Progress through `REINDEXING_CURATED` +6. Final update to `REINDEXING_INDEXED_ON_PROD` + +#### 3.2 Data Handling During Reindex +1. Verify existing DumpUrls are cleared +2. Check new full text data processing +3. Test DumpUrl to DeltaUrl migration +4. Verify pattern reapplication +5. Confirm CuratedUrl updates + +Expected Results: +- Proper status progression through reindexing +- Data integrity maintained +- Patterns correctly reapplied +- Existing customizations preserved + +## Edge Cases and Stress Testing + +### URL Pattern Edge Cases +1. Test URLs with/without trailing slashes +2. Verify handling of overlapping wildcards +3. Check pattern resolution with equal URL count matches +4. Test maximum pattern chain depth +5. Verify handling of malformed URLs + +### Status Transition Edge Cases +1. Test interrupted transitions +2. Verify handling of failed automated actions +3. Check concurrent status updates +4. Test invalid status progressions +5. Verify recovery procedures + +### Data Volume Testing +1. Test with large number of URLs (>100k) +2. Check pattern application performance +3. Verify migration speed with large datasets +4. Test memory usage during bulk operations +5. Check system response under heavy concurrent access + +## Common Issues to Watch For + +1. Pattern Precedence + - Multiple patterns affecting same URL + - Include/exclude pattern conflicts + - Resolution of equal-specificity patterns + +2. Data Integrity + - Field preservation during transitions + - Manual change retention + - Pattern effect tracking + +3. Performance + - Large collection handling + - Multiple pattern application + - Status transition timing + +4. 
Status Management + - Automated trigger reliability + - Status update race conditions + - Recovery from failed transitions diff --git a/sde_collections/models/README_PATTERN_OVERVIEW.md b/sde_collections/models/README_PATTERN_OVERVIEW.md new file mode 100644 index 00000000..1c5202c4 --- /dev/null +++ b/sde_collections/models/README_PATTERN_OVERVIEW.md @@ -0,0 +1,78 @@ +# URL Pattern Management System + +## Overview +This system provides a framework for managing and curating collections of URLs through pattern-based rules. It enables systematic modification, categorization, and filtering of URLs while maintaining a clear separation between work-in-progress changes and production content. + +## Core Concepts + +### URL States +Content progresses through three states: +- **Dump URLs**: Raw content from initial scraping/indexing +- **Delta URLs**: Work-in-progress changes and modifications +- **Curated URLs**: Production-ready, approved content + +### Pattern Types +- **Include/Exclude Patterns**: Control which URLs are included in collections + - Include patterns always override exclude patterns + - Use wildcards for matching multiple URLs + +- **Modification Patterns**: Change URL properties + - Title patterns modify final titles shown in search results + - Document type patterns affect which tab the URL appears under + - Division patterns assign URLs within the Science Knowledge Sources + +### Pattern Resolution +The system uses a "smallest set priority" strategy which resolves conflicts by always using the most specific pattern that matches a URL: +- Multiple patterns can match the same URL +- Pattern matching the smallest number of URLs takes precedence +- Applies to title, division, and document type patterns +- More specific patterns naturally override general ones + +## Getting Started + +To effectively understand this system, we recommend reading through the documentation in the following order: + +1. Begin with the Pattern System Overview to learn the fundamental concepts of how patterns work and interact with URLs +2. Next, explore the URL Lifecycle documentation to understand how content moves through different states +3. The Pattern Resolution documentation will show you how the system handles overlapping patterns +4. Learn how to control which URLs appear in your collection with the Include/Exclude patterns guide +5. Finally, review the Pattern Unapplication Logic to understand how pattern removal affects your URLs + +Each section builds upon knowledge from previous sections, providing a comprehensive understanding of the system. 
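+
+For intuition, the "smallest set priority" rule can be sketched in plain Python. The snippet below is an illustrative toy (a hypothetical `resolve` helper using `fnmatch` for wildcard matching), not the actual resolution code, which operates on database querysets:
+
+```python
+# Toy sketch of "smallest set priority"; not the real COSMOS implementation.
+from fnmatch import fnmatch
+
+
+def resolve(url: str, patterns: list[str], all_urls: list[str]) -> str | None:
+    """Pick the matching pattern whose total match set is smallest."""
+    candidates = [p for p in patterns if fnmatch(url, p)]
+    if not candidates:
+        return None
+    # The most specific pattern is the one that matches the fewest URLs overall.
+    return min(candidates, key=lambda p: sum(fnmatch(u, p) for u in all_urls))
+
+
+urls = [
+    "https://example.com/docs/overview.html",
+    "https://example.com/docs/api/endpoints.html",
+    "https://example.com/docs/api/v2/users.html",
+]
+print(resolve(urls[2], ["*/docs/*", "*/docs/api/*", "*/docs/api/v2/*"], urls))
+# -> */docs/api/v2/*
+```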
+ +## Documentation + +[Pattern System Overview](./README_PATTERN_SYSTEM.md) +- Core concepts and pattern types +- Pattern lifecycle and effects +- Delta URL generation rules +- Working principles (idempotency, separation of concerns) +- Pattern interaction examples + +[URL Lifecycle Management](./README_LIFECYCLE.md) +- Migration process (Dump → Delta) +- Promotion process (Delta → Curated) +- Field handling during transitions +- Pattern application timing +- Data integrity considerations + +[Pattern Resolution](./README_PATTERN_RESOLUTION.md) +- Smallest set priority mechanism +- URL counting and precedence +- Performance considerations +- Edge case handling +- Implementation details + +[URL Inclusion/Exclusion](./README_INCLUSION.md) +- Wildcard pattern matching +- Include/exclude precedence +- Example pattern configurations +- Best practices +- Common pitfalls and solutions + +[Pattern Unapplication Logic](./README_UNAPPLY_LOGIC.md) +- Pattern removal handling +- Delta management during unapplication +- Manual change preservation +- Cleanup procedures +- Edge case handling diff --git a/sde_collections/models/README_PATTERN_RESOLUTION.md b/sde_collections/models/README_PATTERN_RESOLUTION.md new file mode 100644 index 00000000..91a4259a --- /dev/null +++ b/sde_collections/models/README_PATTERN_RESOLUTION.md @@ -0,0 +1,120 @@ +# Pattern Resolution System + +## Overview +The pattern system uses a "smallest set priority" strategy for resolving conflicts between overlapping patterns. This applies to title patterns, division patterns, and document type patterns. The pattern that matches the smallest set of URLs takes precedence. + +## How It Works +When multiple patterns match a URL, the system: +1. Counts how many total URLs each pattern matches +2. Compares the counts +3. Applies the pattern that matches the fewest URLs + +### Example Pattern Hierarchy +``` +Pattern A: */docs/* # Matches 100 URLs +Pattern B: */docs/api/* # Matches 20 URLs +Pattern C: */docs/api/v2/* # Matches 5 URLs + +Example URLs and Which Patterns Apply: +1. https://example.com/docs/overview.html + ✓ Matches Pattern A + ✗ Doesn't match Pattern B or C + Result: Pattern A applies (only match) + +2. https://example.com/docs/api/endpoints.html + ✓ Matches Pattern A + ✓ Matches Pattern B + ✗ Doesn't match Pattern C + Result: Pattern B applies (20 < 100 URLs) + +3. https://example.com/docs/api/v2/users.html + ✓ Matches Pattern A + ✓ Matches Pattern B + ✓ Matches Pattern C + Result: Pattern C applies (5 < 20 < 100 URLs) +``` + +## Pattern Types and Resolution + +### Title Patterns +``` +Patterns: +A: */docs/* → title="Documentation" # Matches 100 URLs +B: */docs/api/* → title="API Reference" # Matches 20 URLs +C: */docs/api/v2/* → title="V2 API Guide" # Matches 5 URLs + +Example URLs: +1. https://example.com/docs/getting-started.html + • Matches: Pattern A + • Result: title="Documentation" + +2. https://example.com/docs/api/authentication.html + • Matches: Patterns A, B + • Result: title="API Reference" + +3. https://example.com/docs/api/v2/oauth.html + • Matches: Patterns A, B, C + • Result: title="V2 API Guide" +``` + +### Division Patterns +``` +Patterns: +A: *.pdf → division="GENERAL" # Matches 500 URLs +B: */specs/*.pdf → division="ENGINEERING" # Matches 50 URLs +C: */specs/2024/*.pdf → division="RESEARCH" # Matches 10 URLs + +Example URLs: +1. https://example.com/docs/report.pdf + • Matches: Pattern A + • Result: division="GENERAL" + +2. 
https://example.com/specs/architecture.pdf + • Matches: Patterns A, B + • Result: division="ENGINEERING" + +3. https://example.com/specs/2024/roadmap.pdf + • Matches: Patterns A, B, C + • Result: division="RESEARCH" +``` + +### Document Type Patterns +``` +Patterns: +A: */docs/* → type="DOCUMENTATION" # Matches 200 URLs +B: */docs/data/* → type="DATA" # Matches 30 URLs +C: */docs/data/schemas/* → type="SCHEMA" # Matches 8 URLs + +Example URLs: +1. https://example.com/docs/guide.html + • Matches: Pattern A + • Result: type="DOCUMENTATION" + +2. https://example.com/docs/data/metrics.json + • Matches: Patterns A, B + • Result: type="DATA" + +3. https://example.com/docs/data/schemas/user.json + • Matches: Patterns A, B, C + • Result: type="SCHEMA" +``` + +## Special Cases + +### Mixed Pattern Types +``` +When different pattern types overlap, each is resolved independently: + +URL: https://example.com/docs/api/v2/schema.json +Matching Patterns: +1. */docs/* → title="Documentation", 100 matches +2. */docs/* → doc_type="DOCUMENTATION", 100 matches +3. */docs/api/* → title="API Reference", 50 matches +4. */docs/api/v2/* → division="ENGINEERING", 10 matches +5. */docs/api/v2/*.json → doc_type="DATA", 3 matches + +Final Result: +• title="API Reference" (from pattern 3, most specific title pattern) +• division="ENGINEERING" (from pattern 4, only matching division pattern) +• doc_type="DATA" (from pattern 5, most specific doc_type pattern) +``` diff --git a/sde_collections/models/README_PATTERN_SYSTEM.md b/sde_collections/models/README_PATTERN_SYSTEM.md new file mode 100644 index 00000000..b8381747 --- /dev/null +++ b/sde_collections/models/README_PATTERN_SYSTEM.md @@ -0,0 +1,112 @@ +# Understanding the Pattern System + +## Overview +The pattern system is designed to manage and track changes to URLs in a content curation workflow. It provides a way to systematically modify, exclude, or categorize URLs while maintaining a clear separation between work-in-progress changes (Delta URLs) and production content (Curated URLs). + +## Core Concepts + +### URL States +- **Curated URLs**: Production-ready, approved content +- **Delta URLs**: Work-in-progress changes, additions, or deletions to curated content +- **Dump URLs**: Raw content from the dev server + +### Pattern Types +1. **Exclude Patterns**: Mark URLs for exclusion from the collection +2. **Include Patterns**: Explicitly include URLs in the collection +3. **Title Patterns**: Change or modify the original title +4. **Document Type Patterns**: Assign document type classifications +5. **Division Patterns**: Assign SMD division + +## Pattern Lifecycle + +### 1. Pattern Creation & Application +When a new pattern is created: +1. System identifies all matching URLs based on the pattern criteria +2. For matching Curated URLs: + - If the pattern would change the URL's properties + - And no Delta URL exists → Create a Delta URL with the changes + - If Delta URL exists → Update it with additional changes +3. For matching Delta URLs: + - Apply the pattern's effects directly + + +### 2. Pattern Effects +- Each pattern type has specific effects: + - Exclude: Sets exclusion status + - Include: Clears exclusion status + - Title: Modifies scraped title + - Document Type: Sets document classification + - Division: Sets organizational division + +### 3. Delta URL Generation Rules +Delta URLs are created when: +1. A new pattern would modify a Curated URL +2. An existing pattern effecting a Curated URL is removed, requiring reversal of its effects +3. 
Reindexed content in DumpUrl differs from Curated content + +Delta URLs are not created when: +1. Pattern effects match current Curated URL state +2. Reindexed content matches Curated content + +### 4. Pattern Removal +When a pattern is deleted: +1. System identifies all URLs affected by the pattern +2. For each affected Curated URL: + - Create Delta URL to reverse effects +3. For affected Delta URLs: + - Remove pattern's effects + - If other patterns still affect it → Keep with updated state + - If Delta URL becomes identical to Curated URL → Delete Delta URL + +## Working Principles + +### 1. Idempotency +- Applying the same pattern multiple times should have the same effect as applying it once +- System tracks pattern effects to ensure consistency +- Multiple patterns can affect the same URL + +### 2. Separation of Concerns +- Pattern effects on Delta URLs don't directly affect Curated URLs +- Exclusion status tracked separately for Delta and Curated URLs +- Changes only propagate to Curated URLs during promotion + +### 3. Change Tracking +- System maintains relationships between patterns and affected URLs +- Each pattern's effects are tracked separately +- Changes can be reversed if patterns are removed + +### 4. Delta URL Lifecycle +1. Creation: + - When patterns would modify Curated URLs + - When DumpUrl content differs from Curated content + - When patterns are removed and effects on CuratedUrls need reversal + +2. Updates: + - When new patterns affect the URL + - When pattern effects change + - When source content changes + +3. Deletion: + - When identical to Curated URL with no pattern effects + - When explicitly marked for deletion + - During promotion to Curated status + +## Pattern Interaction Examples + +### Scenario 1: Multiple Patterns +- Pattern A excludes URLs containing "draft" +- Pattern B sets document type for URLs containing "spec" +- URL: "example.com/draft-spec" +- Result: URL is excluded, document type is set (both patterns apply) + +### Scenario 2: Pattern Removal +- Pattern sets custom title for URLs +- URLs have custom titles in production +- Pattern is deleted +- Result: Delta URLs created to restore original titles + +### Scenario 3: Conflicting Patterns +- Pattern A includes URLs containing "docs" +- Pattern B excludes URLs containing "internal" +- URL: "example.com/docs/internal" +- Result: Url is included - Includes always take precedence diff --git a/sde_collections/models/README_REINDEXING_STATUSES.md b/sde_collections/models/README_REINDEXING_STATUSES.md new file mode 100644 index 00000000..144a83c2 --- /dev/null +++ b/sde_collections/models/README_REINDEXING_STATUSES.md @@ -0,0 +1,74 @@ +# Reindexing Status Documentation + +### Status Flow + +The typical reindexing status flow is: + +1. `REINDEXING_NOT_NEEDED` ("Re-Indexing Not Needed") → Default state +2. `REINDEXING_NEEDED_ON_DEV` ("Re-Indexing Needed") → When reindexing is required +3. `REINDEXING_FINISHED_ON_DEV` ("Re-Indexing Finished") → After reindexing completes +4. `REINDEXING_READY_FOR_CURATION` ("Ready for Re-Curation") → After dump URLs are migrated +5. `REINDEXING_CURATION_IN_PROGRESS` ("Re-Curation in Progress") → During active re-curation +6. `REINDEXING_CURATED` ("Re-Curation Finished") → After re-curation is complete +7. 
`REINDEXING_INDEXED_ON_PROD` ("Re-Indexed on Prod") → After successful prod indexing + +## Status Descriptions +### Reindexing Not Needed +- Variable name: `REINDEXING_NOT_NEEDED` (1) +- Default status for new collections +- Applied to collections in early workflow stages (research, engineering, etc.) + +### Reindexing Needed on LRM Dev +- Variable name: `REINDEXING_NEEDED_ON_DEV` (2) +- Indicates collections that need to be reindexed on LRM Dev environment +- For collections that have already been indexed on production + +### Reindexing Finished on LRM Dev +- Variable name: `REINDEXING_FINISHED_ON_DEV` (3) +- For collections that have completed reindexing on LRM Dev +- Currently managed manually by LRM team via admin interface + +### Ready for Re-Curation +- Variable name: `REINDEXING_READY_FOR_CURATION` (4) +- Automatically set when: + - A collection's dump URLs are migrated to delta URLs AND there are curated URLs present + - Triggered by Collection.migrate_dump_to_delta() method + +### Re-Curation in Progress +- Variable name: `REINDEXING_CURATION_IN_PROGRESS` (5) +- Indicates that collection is actively being re-curated +- Manually set when curator begins re-curation work +- Transitions to `REINDEXING_CURATED` when re-curation is complete + +### Re-Curation Finished +- Variable name: `REINDEXING_CURATED` (6) +- Automatically set when: + - Delta URLs are promoted to curated URLs AND there are curated URLs present + - Triggered by Collection.promote_to_curated() method + +### Re-Indexed on Prod +- Variable name: `REINDEXING_INDEXED_ON_PROD` (7) +- Currently managed manually via command line +- Future: Will be set automatically via plugin ping + +### Key Code Locations for Automatic Changes + +1. In migrate_dump_to_delta(): +```python +# After migrating, check if we should update reindexing status +curated_urls_count = self.curated_urls.count() +if curated_urls_count > 0: + self.reindexing_status = ReindexingStatusChoices.REINDEXING_READY_FOR_CURATION + self.save() +``` + +2. In promote_to_curated(): +```python +# After promoting, check if we should update reindexing status +curated_urls_count = self.curated_urls.count() +if curated_urls_count > 0: + self.reindexing_status = ReindexingStatusChoices.REINDEXING_CURATED + self.save() +``` + +Note: All status changes are logged in the ReindexingHistory model for tracking purposes. diff --git a/sde_collections/models/README_STATUS_TRIGGERS.md b/sde_collections/models/README_STATUS_TRIGGERS.md new file mode 100644 index 00000000..8f8b8397 --- /dev/null +++ b/sde_collections/models/README_STATUS_TRIGGERS.md @@ -0,0 +1,68 @@ +# Collection Status Workflows + +This document outlines the automated workflows triggered by status changes in Collections. + +## Workflow Status Transitions + +Collections progress through workflow statuses that trigger specific automated actions: + +### Initial Flow +1. `RESEARCH_IN_PROGRESS` → `READY_FOR_ENGINEERING` + - Triggers: Creation of initial scraper and indexer configs + +2. `READY_FOR_ENGINEERING` → `ENGINEERING_IN_PROGRESS` → `INDEXING_FINISHED_ON_DEV` + - When indexing finishes, a developer changes the status to `INDEXING_FINISHED_ON_DEV` + - This will trigger a full text fetch from LRM dev + - If the fetch completes successfully, it updates the status to `READY_FOR_CURATION` + +3. `READY_FOR_CURATION` + - Triggers creation/update of plugin config + +4. 
`READY_FOR_CURATION` → `CURATION_IN_PROGRESS` → `CURATED` + - When curation finishes, the curator marks the collection as `CURATED` + - This triggers the promotion of DeltaUrls to CuratedUrls + +5. Quality Check Flow: + - During quality checks the curator can put the status as `QUALITY_CHECK_PERFECT/MINOR` + - These passing quality statuses will trigger the addition of the collection to the public query + - After the PR is merged and SDE Prod server is updated with the latest code, this collection will become visible + +### Reindexing Flow + +After the main workflow, collections can enter a reindexing cycle: + +1. `REINDEXING_NOT_NEEDED` → `REINDEXING_NEEDED_ON_DEV` + - By default collections do not need reindexing + - They can be manually marked as reindexing needed on dev + +2. `REINDEXING_NEEDED_ON_DEV` → `REINDEXING_FINISHED_ON_DEV` + - When re-indexing finishes, a developer changes the status to `REINDEXING_FINISHED_ON_DEV` + - This will trigger a full text fetch from LRM dev + - If the fetch completes successfully, it updates the status to `REINDEXING_READY_FOR_CURATION` + +3. `REINDEXING_READY_FOR_CURATION` → `REINDEXING_CURATED` + - When re-curation finishes, the curator marks the collection as `REINDEXING_CURATED` + - This triggers the promotion of DeltaUrls to CuratedUrls + +4. `REINDEXING_CURATED` → `REINDEXING_INDEXED_ON_PROD` + - After the collection has been indexed on Prod, a dev marks it as `REINDEXING_INDEXED_ON_PROD` + +## Full Text Import Process + +The full text import process integrates with both workflows: + +1. Clears existing DumpUrls for the collection +2. Fetches and processes new full text data in batches +3. Creates new DumpUrls +4. Migrates DumpUrls to DeltaUrls +5. Updates collection status based on context: + - In main workflow: Updates to `READY_FOR_CURATION` + - In reindexing: Updates to `REINDEXING_READY_FOR_CURATION` + +## Key Models and Files + +- `Collection`: Main model handling status transitions +- `WorkflowStatusChoices`: Enum defining main workflow states +- `ReindexingStatusChoices`: Enum defining reindexing states +- `tasks.py`: Contains full text import logic and status updates +- Signal handler in Collection model manages status change triggers diff --git a/sde_collections/models/README_UNAPPLY_LOGIC.md b/sde_collections/models/README_UNAPPLY_LOGIC.md new file mode 100644 index 00000000..1000f5f8 --- /dev/null +++ b/sde_collections/models/README_UNAPPLY_LOGIC.md @@ -0,0 +1,152 @@ +# Pattern System Unapply Logic + +## Core Principles +1. When patterns are removed, we need to handle deltas based on their relationship to curated URLs +2. Deltas should only exist if they differ from their curated counterparts, or if no curated URL exists +3. Multiple patterns can affect the same URL +4. 
Manual changes to deltas should be preserved
+
+## Cases to Handle
+
+### Case 1: Delta Only (New URL)
+**Scenario:**
+- No curated URL exists for this URL
+- Delta URL exists with pattern effect
+- Pattern is removed
+```
+Curated: None exists
+Delta: url=new.com, division=None
+```
+`[Pattern: division=BIOLOGY], created`
+```
+Curated: None exists
+Delta: url=new.com, division=BIOLOGY
+```
+`[Pattern: division=BIOLOGY], deleted`
+```
+Curated: None exists
+Delta: url=new.com, division=None
+```
+
+### Case 2: Delta Created to Apply Pattern
+**Scenario:**
+- A Curated with no division already exists
+- A pattern is created
+- A delta is created to apply a pattern
+- Pattern is removed
+- Delta should be deleted
+```
+Curated: division=None
+```
+`[Pattern: division=BIOLOGY], created`
+```
+Curated: division=None
+Delta: division=BIOLOGY (from pattern)
+```
+`[Pattern: division=BIOLOGY], deleted`
+```
+Curated: division=None
+```
+
+### Case 3: Pre-existing Delta
+**Scenario:**
+- A Curated with no division already exists
+- A Delta with an updated scraped_title exists
+- A pattern is created to set division
+- A delta is created to apply a pattern
+- Pattern is removed
+- Delta should be maintained because of scraped_title
+
+```
+Curated: division=None
+Delta: scraped_title="Modified", division=None
+```
+`[Pattern: division=BIOLOGY], created`
+```
+Curated: division=None
+Delta: scraped_title="Modified", division=BIOLOGY (from pattern)
+```
+`[Pattern: division=BIOLOGY], deleted`
+```
+Curated: division=None
+Delta: scraped_title="Modified", division=None
+```
+
+### Case 4: Multiple Pattern Effects
+**Scenario:**
+- Delta has changes from multiple patterns
+- One pattern is removed
+```
+Delta: division=BIOLOGY, doc_type=DATA (from two patterns)
+Pattern: division=BIOLOGY
+Pattern: doc_type=DATA
+```
+`[Pattern: division=BIOLOGY], deleted`
+```
+Delta: division=None, doc_type=DATA
+Pattern: doc_type=DATA
+```
+
+### Case 5: Overlapping Patterns, Specific Deleted
+```
+Delta: division=ASTROPHYSICS (because of specific pattern)
+Specific Pattern: division=ASTROPHYSICS
+General Pattern: division=BIOLOGY
+```
+`[Specific Pattern: division=ASTROPHYSICS], deleted`
+
+```
+Delta: division=BIOLOGY (because of general pattern)
+General Pattern: division=BIOLOGY
+```
+
+
+### Case 6: Overlapping Patterns, General Deleted
+```
+Delta: division=ASTROPHYSICS (because of specific pattern)
+Specific Pattern: division=ASTROPHYSICS
+General Pattern: division=BIOLOGY
+```
+`[General Pattern: division=BIOLOGY], deleted`
+
+```
+Delta: division=ASTROPHYSICS (because of specific pattern)
+Specific Pattern: division=ASTROPHYSICS
+```
+
+
+## Implementation Steps
+
+1. **Get Affected URLs**
+   - Get all deltas and curated URLs that match pattern
+   - For each URL determine what exists (delta only, both, or curated only)
+
+2. **For Each Delta URL Found**
+   - If no matching curated exists:
+     - Set pattern's field to null
+   - If matching curated exists:
+     - Set pattern's field to curated value
+     - If delta now matches curated exactly, delete delta
+
+3. **For Each Curated URL without Delta**
+   - Create new delta with pattern's field set to null
+
+4. **Cleanup**
+   - Clear pattern's relationships with URLs
+   - Remove pattern from database
+
+## Edge Cases to Handle
+
+1. **Field Comparison**
+   - When comparing delta to curated, ignore id and to_delete fields
+   - All other fields must match exactly for delta deletion
+
+2. 
**Manual Changes** + - Preserve any delta fields not modified by this pattern + - Only delete delta if ALL fields match curated + +3. **Multiple Collections** + - Only affect URLs in pattern's collection + +4. **Invalid States** + - Handle missing URLs gracefully + - Skip URLs that no longer exist diff --git a/sde_collections/models/candidate_url.py b/sde_collections/models/candidate_url.py index 51c3a28b..2fe3dd82 100644 --- a/sde_collections/models/candidate_url.py +++ b/sde_collections/models/candidate_url.py @@ -2,10 +2,12 @@ import os from urllib.parse import urlparse +from django.contrib.postgres.fields import ArrayField from django.db import models +from ..utils.paired_field_descriptor import PairedFieldDescriptor from .collection import Collection -from .collection_choice_fields import Divisions, DocumentTypes +from .collection_choice_fields import Divisions, DocumentTypes, TDAMMTags from .pattern import ExcludePattern, TitlePattern @@ -35,6 +37,13 @@ class CandidateURL(models.Model): blank=True, help_text="This is the original title scraped by Sinequa", ) + scraped_text = models.TextField( + "Scraped Text", + default="", + null=True, + blank=True, + help_text="This is the text scraped by Sinequa", + ) generated_title = models.CharField( "Generated Title", default="", @@ -79,6 +88,12 @@ class CandidateURL(models.Model): default=False, help_text="Helps keep track if the Current URL is present in production or not", ) + # is_tdamm = models.BooleanField("Is TDAMM?", default=False, help_text="Enable TDAMM tagging for this URL") + tdamm_tag = PairedFieldDescriptor( + field_name="tdamm_tag", + field_type=ArrayField(models.CharField(max_length=255, choices=TDAMMTags.choices), blank=True, null=True), + verbose_name="TDAMM Tags", + ) class Meta: """Meta definition for Candidate URL.""" diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index c5690a4b..0f1162e1 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -3,14 +3,15 @@ import requests from django.contrib.auth import get_user_model +from django.contrib.contenttypes.models import ContentType from django.db import models -from django.db.models import Q from django.db.models.signals import post_save from django.dispatch import receiver from model_utils import FieldTracker from slugify import slugify from config_generation.db_to_xml import XmlEditor +from sde_collections.tasks import fetch_and_replace_full_text from ..utils.github_helper import GitHubHandler from ..utils.slack_utils import ( @@ -23,12 +24,15 @@ CurationStatusChoices, Divisions, DocumentTypes, + ReindexingStatusChoices, SourceChoices, UpdateFrequencies, WorkflowStatusChoices, ) +from .delta_url import CuratedUrl, DeltaUrl, DumpUrl User = get_user_model() +DELTA_COMPARISON_FIELDS = ["scraped_title"] # Add more fields as needed class Collection(models.Model): @@ -73,7 +77,12 @@ class Collection(models.Model): choices=WorkflowStatusChoices.choices, default=WorkflowStatusChoices.RESEARCH_IN_PROGRESS, ) - tracker = FieldTracker(fields=["workflow_status"]) + reindexing_status = models.IntegerField( + choices=ReindexingStatusChoices.choices, + default=ReindexingStatusChoices.REINDEXING_NOT_NEEDED, + verbose_name="Reindexing Status", + ) + tracker = FieldTracker(fields=["workflow_status", "reindexing_status"]) curated_by = models.ForeignKey(User, on_delete=models.DO_NOTHING, null=True, blank=True) curation_started = models.DateTimeField("Curation Started", null=True, blank=True) @@ -84,6 +93,145 @@ 
class Meta: verbose_name = "Collection" verbose_name_plural = "Collections" + def clear_delta_urls(self): + """Clears all DeltaUrls for this collection.""" + DeltaUrl.objects.filter(collection=self).delete() + + def clear_dump_urls(self): + """Clears all DumpUrls for this collection.""" + DumpUrl.objects.filter(collection=self).delete() + + def refresh_url_lists_for_all_patterns(self): + """ + Updates pattern relations for all patterns associated with this collection. + """ + # List of pattern models to update + pattern_models = [ + "DeltaExcludePattern", + "DeltaIncludePattern", + "DeltaTitlePattern", + "DeltaDocumentTypePattern", + "DeltaDivisionPattern", + ] + + # Loop through each model and update its relations + for model_name in pattern_models: + # Get the model dynamically + model = ContentType.objects.get(app_label="sde_collections", model=model_name.lower()).model_class() + + # Filter patterns for the current collection and update relations + for pattern in model.objects.filter(collection=self): + pattern.update_affected_delta_urls_list() + pattern.update_affected_curated_urls_list() + + def migrate_dump_to_delta(self): + """ + Migrates data from DumpUrls to DeltaUrls, preserving all fields. + Creates DeltaUrls that reflect: + 1. Changes from DumpUrls vs CuratedUrls + 2. Missing URLs in DumpUrls that exist in CuratedUrls (marked for deletion) + """ + # Step 1: Clear existing DeltaUrls for this collection + self.clear_delta_urls() + + # Step 2: Fetch all current DumpUrls and CuratedUrls for this collection + dump_urls = {url.url: url for url in DumpUrl.objects.filter(collection=self)} + curated_urls = {url.url: url for url in CuratedUrl.objects.filter(collection=self)} + + # Step 3: Process each URL in DumpUrls to migrate as needed + for url, dump in dump_urls.items(): + curated = curated_urls.get(url) + + if curated: + # Check if any of the comparison fields differ + if any(getattr(curated, field) != getattr(dump, field) for field in DELTA_COMPARISON_FIELDS): + self.create_or_update_delta_url(dump, to_delete=False) + else: + # New URL, not in CuratedUrls; move it entirely to DeltaUrls + self.create_or_update_delta_url(dump, to_delete=False) + + # Step 4: Identify CuratedUrls missing in DumpUrls and flag them for deletion in DeltaUrls + for curated in curated_urls.values(): + if curated.url not in dump_urls: + self.create_or_update_delta_url(curated, to_delete=True) + + # Step 5: Clear DumpUrls after migration is complete + self.clear_dump_urls() + + # Step 6: Apply all patterns to DeltaUrls + # self.refresh_url_lists_for_all_patterns() # TODO: I'm pretty confident we shouldn't be running this + self.apply_all_patterns() + + def create_or_update_delta_url(self, url_instance, to_delete=False): + """ + Creates or updates a DeltaUrl entry based on the given DumpUrl or CuratedUrl object. + Always copies all fields, even for deletion cases. + + Args: + url_instance: DumpUrl or CuratedUrl instance to copy from + to_delete: Whether to mark the resulting DeltaUrl for deletion + """ + # Get all copyable fields from the source instance + fields_to_copy = { + field.name: getattr(url_instance, field.name) + for field in url_instance._meta.fields + if field.name not in ["id", "collection"] + } + + # Set deletion status + fields_to_copy["to_delete"] = to_delete + + # Update or create the DeltaUrl + DeltaUrl.objects.update_or_create(collection=self, url=url_instance.url, defaults=fields_to_copy) + + def promote_to_curated(self): + """ + Promotes all DeltaUrls in this collection to CuratedUrls. 
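To make the dump-to-delta decision in `migrate_dump_to_delta` concrete, here is a condensed sketch of the same comparison using plain dicts keyed by URL rather than the Django models. Only `DELTA_COMPARISON_FIELDS` is taken from the code above; the function and data are illustrative.

```python
# Condensed sketch of the dump -> delta decision in migrate_dump_to_delta.
DELTA_COMPARISON_FIELDS = ["scraped_title"]


def build_deltas(dump: dict[str, dict], curated: dict[str, dict]) -> dict[str, dict]:
    deltas: dict[str, dict] = {}
    for url, row in dump.items():
        existing = curated.get(url)
        if existing is None:
            deltas[url] = {**row, "to_delete": False}      # brand new URL
        elif any(existing[f] != row[f] for f in DELTA_COMPARISON_FIELDS):
            deltas[url] = {**row, "to_delete": False}      # changed URL
        # identical rows produce no delta at all
    for url, row in curated.items():
        if url not in dump:
            deltas[url] = {**row, "to_delete": True}       # disappeared URL
    return deltas


dump = {"a": {"scraped_title": "same"}, "b": {"scraped_title": "new title"}}
curated = {
    "a": {"scraped_title": "same"},
    "b": {"scraped_title": "old"},
    "c": {"scraped_title": "gone"},
}
result = build_deltas(dump, curated)
assert "a" not in result                        # unchanged -> no delta
assert result["b"]["scraped_title"] == "new title"
assert result["c"]["to_delete"] is True         # deletion is recorded as a delta
```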
+ Updates, adds, or removes CuratedUrls as necessary to match the latest DeltaUrls. + """ + # Step 1: Fetch all current DeltaUrls and CuratedUrls for this collection + delta_urls = {url.url: url for url in DeltaUrl.objects.filter(collection=self)} + curated_urls = {url.url: url for url in CuratedUrl.objects.filter(collection=self)} + + # Step 2: Process each DeltaUrl to update or create the corresponding CuratedUrl + for url, delta in delta_urls.items(): + curated = curated_urls.get(url) + + # Delete the CuratedUrl if the DeltaUrl is marked for deletion + if delta.to_delete: + if curated: + curated.delete() + continue + + if curated: + updated_fields = {} + for field in delta._meta.fields: + field_name = field.name + if field_name in ["to_delete", "id"]: + continue + + delta_value = getattr(delta, field_name) + if getattr(curated, field_name) != delta_value: + updated_fields[field_name] = delta_value + + if updated_fields: + CuratedUrl.objects.filter(pk=curated.pk).update(**updated_fields) + else: + # Previously, we excluded fields with values of None and "" + # however, such null values are considered meaningful and should be copied over + new_data = { + field.name: getattr(delta, field.name) + for field in delta._meta.fields + if field.name not in ["to_delete", "collection", "id"] and getattr(delta, field.name) + } + CuratedUrl.objects.create(collection=self, **new_data) + + # Step 3: Clear all DeltaUrls for this collection since they've been promoted + DeltaUrl.objects.filter(collection=self).delete() + + # Step 4: Reapply patterns to DeltaUrls + self.refresh_url_lists_for_all_patterns() + def add_to_public_query(self): """Add the collection to the public query.""" if self.workflow_status not in [ @@ -108,10 +256,6 @@ def add_to_public_query(self): scraper_content = scraper_editor.update_config_xml() gh.create_or_update_file(query_path, scraper_content) - @property - def included_urls_count(self): - return self.candidate_urls.filter(excluded=False).count() - @property def _scraper_config_path(self) -> str: return f"sources/scrapers/{self.config_folder}/default.xml" @@ -130,7 +274,7 @@ def tree_root(self) -> str: @property def server_url_secret_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "secret-prod", "scope": "All", @@ -144,7 +288,7 @@ def server_url_secret_prod(self) -> str: @property def server_url_prod(self) -> str: - base_url = "https://sciencediscoveryengine.nasa.gov" + base_url = "https://sciencediscoveryengine.nasa.gov" # noqa: E231 payload = { "name": "query-smd-primary", "scope": "All", @@ -199,6 +343,19 @@ def workflow_status_button_color(self) -> str: } return color_choices[self.workflow_status] + @property + def reindexing_status_button_color(self) -> str: + color_choices = { + 1: "btn-light", # REINDEXING_NOT_NEEDED + 2: "btn-danger", # REINDEXING_NEEDED_ON_DEV (matching Ready For Engineering) + 3: "btn-info", # REINDEXING_FINISHED_ON_DEV (matching Indexing Finished on LRM Dev) + 4: "btn-info", # REINDEXING_READY_FOR_CURATION (matching Ready for Curation) + 5: "btn-success", # REINDEXING_CURATION_IN_PROGRESS (matching Curation in Progress) + 6: "btn-primary", # REINDEXING_CURATED (matching Curated) + 7: "btn-primary", # REINDEXING_INDEXED_ON_PROD (matching Prod: Perfect) + } + return color_choices[self.reindexing_status] + def _process_exclude_list(self): """Process the exclude list.""" return [pattern._process_match_pattern() for pattern in 
self.excludepattern.all()] @@ -371,13 +528,12 @@ def candidate_urls_count(self) -> int: @property def sinequa_configuration(self) -> str: - return ( - f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" - ) + URL = f"https://github.com/NASA-IMPACT/sde-backend/blob/production/sources/SDE/{self.config_folder}/default.xml" # noqa: E231, E501 + return URL @property def github_issue_link(self) -> str: - return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" + return f"https://github.com/NASA-IMPACT/sde-project/issues/{self.github_issue_number}" # noqa: E231 @classmethod def _fetch_json_results(cls, url): @@ -460,15 +616,22 @@ def sync_with_production_webapp(self) -> None: self.save() - def apply_all_patterns(self) -> None: - """Apply all the patterns.""" - for pattern in self.excludepattern.all(): + def apply_all_patterns(self): + """Apply all the patterns with debug information.""" + + for pattern in self.deltaexcludepatterns.all(): + pattern.apply() + + for pattern in self.deltaincludepatterns.all(): pattern.apply() - for pattern in self.includepattern.all(): + + for pattern in self.deltatitlepatterns.all(): pattern.apply() - for pattern in self.titlepattern.all(): + + for pattern in self.deltadocumenttypepatterns.all(): pattern.apply() - for pattern in self.documenttypepattern.all(): + + for pattern in self.deltadivisionpatterns.all(): pattern.apply() def save(self, *args, **kwargs): @@ -484,7 +647,13 @@ def save(self, *args, **kwargs): if transition in STATUS_CHANGE_NOTIFICATIONS: details = STATUS_CHANGE_NOTIFICATIONS[transition] message = format_slack_message(self.name, details, self.id) - send_slack_message(message) + try: + # TODO: find a better way to allow this to work on dev environments with + # no slack integration + send_slack_message(message) + except Exception as e: + print(f"Error sending Slack message: {e}") + # Call the parent class's save method super().save(*args, **kwargs) @@ -492,6 +661,7 @@ def __init__(self, *args, **kwargs): # Create a cached version of the last workflow_status to compare against super().__init__(*args, **kwargs) self.old_workflow_status = self.workflow_status + self.old_reindexing_status = self.reindexing_status class RequiredUrls(models.Model): @@ -572,21 +742,72 @@ def log_workflow_history(sender, instance, created, **kwargs): old_status=instance.old_workflow_status, ) + if instance.reindexing_status != instance.old_reindexing_status: + ReindexingHistory.objects.create( + collection=instance, + reindexing_status=instance.reindexing_status, + curated_by=instance.curated_by, + old_status=instance.old_reindexing_status, + ) + + +class ReindexingHistory(models.Model): + collection = models.ForeignKey(Collection, on_delete=models.CASCADE, related_name="reindexing_history", null=True) + reindexing_status = models.IntegerField( + choices=ReindexingStatusChoices.choices, + default=ReindexingStatusChoices.REINDEXING_NOT_NEEDED, + ) + old_status = models.IntegerField(choices=ReindexingStatusChoices.choices, null=True) + curated_by = models.ForeignKey(User, on_delete=models.DO_NOTHING, null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + def __str__(self): + return str(self.collection) + str(self.reindexing_status) + + @property + def reindexing_status_button_color(self) -> str: + color_choices = { + 1: "btn-light", # REINDEXING_NOT_NEEDED + 2: "btn-warning", # REINDEXING_NEEDED_ON_DEV + 3: "btn-secondary", # REINDEXING_FINISHED_ON_DEV + 4: 
"btn-info", # REINDEXING_READY_FOR_CURATION + 5: "btn-primary", # REINDEXING_CURATED + 6: "btn-success", # REINDEXING_INDEXED_ON_PROD + } + return color_choices[self.reindexing_status] + @receiver(post_save, sender=Collection) def create_configs_on_status_change(sender, instance, created, **kwargs): - """ - Creates various config files on certain workflow status changes - """ - - if "workflow_status" in instance.tracker.changed(): - if instance.workflow_status == WorkflowStatusChoices.READY_FOR_CURATION: - instance.create_plugin_config(overwrite=True) - elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING: - instance.create_scraper_config(overwrite=False) - instance.create_indexer_config(overwrite=False) - elif instance.workflow_status in [ - WorkflowStatusChoices.QUALITY_CHECK_PERFECT, - WorkflowStatusChoices.QUALITY_CHECK_MINOR, - ]: - instance.add_to_public_query() + """Creates various config files on certain workflow status changes""" + + if getattr(instance, "_handling_status_change", False): + return + + try: + instance._handling_status_change = True + + if "workflow_status" in instance.tracker.changed(): + if instance.workflow_status == WorkflowStatusChoices.READY_FOR_CURATION: + instance.create_plugin_config(overwrite=True) + elif instance.workflow_status == WorkflowStatusChoices.CURATED: + instance.promote_to_curated() + elif instance.workflow_status == WorkflowStatusChoices.READY_FOR_ENGINEERING: + instance.create_scraper_config(overwrite=False) + instance.create_indexer_config(overwrite=False) + elif instance.workflow_status == WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV: + fetch_and_replace_full_text.delay(instance.id, "lrm_dev") + elif instance.workflow_status in [ + WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + WorkflowStatusChoices.QUALITY_CHECK_MINOR, + ]: + instance.add_to_public_query() + + if "reindexing_status" in instance.tracker.changed(): + if instance.reindexing_status == ReindexingStatusChoices.REINDEXING_FINISHED_ON_DEV: + fetch_and_replace_full_text.delay(instance.id, "lrm_dev") + elif instance.reindexing_status == ReindexingStatusChoices.REINDEXING_CURATED: + instance.promote_to_curated() + + finally: + instance._handling_status_change = False diff --git a/sde_collections/models/collection_choice_fields.py b/sde_collections/models/collection_choice_fields.py index 3a9a3664..c907d08b 100644 --- a/sde_collections/models/collection_choice_fields.py +++ b/sde_collections/models/collection_choice_fields.py @@ -97,3 +97,75 @@ class WorkflowStatusChoices(models.IntegerChoices): PROD_MAJOR = 16, "Prod: Major Issues" MERGE_PENDING = 17, "Code Merge Pending" NEEDS_DELETE = 19, "Delete from Prod" + INDEXING_FINISHED_ON_DEV = 20, "Indexing Finished on LRM Dev" + + +class ReindexingStatusChoices(models.IntegerChoices): + REINDEXING_NOT_NEEDED = 1, "Re-Indexing Not Needed" + REINDEXING_NEEDED_ON_DEV = 2, "Re-Indexing Needed" + REINDEXING_FINISHED_ON_DEV = 3, "Re-Indexing Finished" + REINDEXING_READY_FOR_CURATION = 4, "Ready for Re-Curation" + REINDEXING_CURATION_IN_PROGRESS = 5, "Re-Curation in Progress" + REINDEXING_CURATED = 6, "Re-Curation Finished" + REINDEXING_INDEXED_ON_PROD = 7, "Re-Indexed on Prod" + + # @classmethod + # def get_status_string(cls, value): + # for choice in cls.choices: + # if choice[0] == value: + # return choice[1] + # return "N/A" + + +class TDAMMTags(models.TextChoices): + """TDAMM (Tagged Data for Multi-Messenger Astronomy) tag choices.""" + + NOT_TDAMM = "Not TDAMM", "Not TDAMM" + MMA_M_EM = "MMA_M_EM", "Messenger - EM 
Radiation" + MMA_M_EM_G = "MMA_M_EM_G", "Messenger - EM Radiation - Gamma rays" + MMA_M_EM_X = "MMA_M_EM_X", "Messenger - EM Radiation - X-rays" + MMA_M_EM_U = "MMA_M_EM_U", "Messenger - EM Radiation - Ultraviolet" + MMA_M_EM_O = "MMA_M_EM_O", "Messenger - EM Radiation - Optical" + MMA_M_EM_I = "MMA_M_EM_I", "Messenger - EM Radiation - Infrared" + MMA_M_EM_M = "MMA_M_EM_M", "Messenger - EM Radiation - Microwave" + MMA_M_EM_R = "MMA_M_EM_R", "Messenger - EM Radiation - Radio" + MMA_M_G = "MMA_M_G", "Messenger - Gravitational Waves" + MMA_M_G_CBI = "MMA_M_G_CBI", "Messenger - Gravitational Waves - Compact Binary Inspiral" + MMA_M_G_S = "MMA_M_G_S", "Messenger - Gravitational Waves - Stochastic" + MMA_M_G_CON = "MMA_M_G_CON", "Messenger - Gravitational Waves - Continuous" + MMA_M_G_B = "MMA_M_G_B", "Messenger - Gravitational Waves - Burst" + MMA_M_C = "MMA_M_C", "Messenger - Cosmic Rays" + MMA_M_N = "MMA_M_N", "Messenger - Neutrinos" + MMA_O_BI = "MMA_O_BI", "Objects - Binaries" + MMA_O_BI_BBH = "MMA_O_BI_BBH", "Objects - Binaries - Binary Black Holes" + MMA_O_BI_BNS = "MMA_O_BI_BNS", "Objects - Binaries - Binary Neutron Stars" + MMA_O_BI_C = "MMA_O_BI_C", "Objects - Binaries - Cataclysmic Variables" + MMA_O_BI_N = "MMA_O_BI_N", "Objects - Binaries - Neutron Star-Black Hole" + MMA_O_BI_B = "MMA_O_BI_B", "Objects - Binaries - Binary Pulsars" + MMA_O_BI_W = "MMA_O_BI_W", "Objects - Binaries - White Dwarf Binaries" + MMA_O_BH = "MMA_O_BH", "Objects - Black Holes" + MMA_O_BH_AGN = "MMA_O_BH_AGN", "Objects - Black Holes - Active Galactic Nuclei" + MMA_O_BH_IM = "MMA_O_BH_IM", "Objects - Black Holes - Intermediate mass" + MMA_O_BH_STM = "MMA_O_BH_STM", "Objects - Black Holes - Stellar mass" + MMA_O_BH_SUM = "MMA_O_BH_SUM", "Objects - Black Holes - Supermassive" + MMA_O_E = "MMA_O_E", "Objects - Exoplanets" + MMA_O_N = "MMA_O_N", "Objects - Neutron Stars" + MMA_O_N_M = "MMA_O_N_M", "Objects - Neutron Stars - Magnetars" + MMA_O_N_P = "MMA_O_N_P", "Objects - Neutron Stars - Pulsars" + MMA_O_N_PWN = "MMA_O_N_PWN", "Objects - Neutron Stars - Pulsar Wind Nebula" + MMA_O_S = "MMA_O_S", "Objects - Supernova Remnants" + MMA_S_F = "MMA_S_F", "Signals - Fast Radio Bursts" + MMA_S_G = "MMA_S_G", "Signals - Gamma-ray Bursts" + MMA_S_K = "MMA_S_K", "Signals - Kilonovae" + MMA_S_N = "MMA_S_N", "Signals - Novae" + MMA_S_P = "MMA_S_P", "Signals - Pevatrons" + MMA_S_ST = "MMA_S_ST", "Signals - Stellar flares" + MMA_S_SU = "MMA_S_SU", "Signals - Supernovae" + + @classmethod + def lookup_by_text(cls, text: str) -> str | None: + """Look up a TDAMM tag by its display text.""" + for choice in cls.choices: + if choice[1].lower() == text.lower(): + return choice[0] + return None diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py new file mode 100644 index 00000000..61c8e9ea --- /dev/null +++ b/sde_collections/models/delta_patterns.py @@ -0,0 +1,693 @@ +import re +from typing import Any + +from django.apps import apps +from django.core.exceptions import ValidationError +from django.db import models + +from ..utils.title_resolver import ( + is_valid_xpath, + parse_title, + resolve_title, + validate_fstring, +) +from .collection_choice_fields import Divisions, DocumentTypes + + +class BaseMatchPattern(models.Model): + """Base class for all delta patterns.""" + + class MatchPatternTypeChoices(models.IntegerChoices): + INDIVIDUAL_URL = 1, "Individual URL Pattern" + MULTI_URL_PATTERN = 2, "Multi-URL Pattern" + + collection = models.ForeignKey( + "Collection", + 
on_delete=models.CASCADE, + related_name="%(class)ss", # Makes collection.deltaincludepatterns.all() + related_query_name="%(class)ss", + ) + match_pattern = models.CharField( + "Pattern", help_text="This pattern is compared against the URL of all documents in the collection" + ) + match_pattern_type = models.IntegerField(choices=MatchPatternTypeChoices.choices, default=1) + delta_urls = models.ManyToManyField( + "DeltaUrl", + related_name="%(class)ss", # Makes delta_url.deltaincludepatterns.all() + ) + curated_urls = models.ManyToManyField( + "CuratedUrl", + related_name="%(class)ss", # Makes curated_url.deltaincludepatterns.all() + ) + + def get_url_match_count(self): + """ + Get the number of unique URLs this pattern matches across both delta and curated URLs. + """ + delta_urls = set(self.get_matching_delta_urls().values_list("url", flat=True)) + curated_urls = set(self.get_matching_curated_urls().values_list("url", flat=True)) + return len(delta_urls.union(curated_urls)) + + def is_most_distinctive_pattern(self, url) -> bool: + """ + Determine if this pattern should apply to a URL by checking: + 1. First checks if this pattern matches this URL + 2. If it matches the smallest number of URLs among all patterns that match this URL + 3. If tied for smallest number of matches, uses the longest pattern string + Returns True if this pattern should be applied. + """ + # First check if this pattern matches the URL + regex_pattern = self.get_regex_pattern() + if not re.search(regex_pattern, url.url): + return False + + my_match_count = self.get_url_match_count() + my_pattern_length = len(self.match_pattern) + + # Get patterns from same type that affect this URL + pattern_class = self.__class__ + matching_patterns = ( + pattern_class.objects.filter(collection=self.collection) + .filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url)) + .exclude(id=self.id) + .distinct() + ) + + # Use M2M relationships for checking other patterns since those are already established + for pattern in matching_patterns: + other_match_count = pattern.get_url_match_count() + if other_match_count < my_match_count: + # Other pattern matches fewer URLs - definitely not most distinctive + return False + if other_match_count == my_match_count: + # Same match count - check pattern length + if len(pattern.match_pattern) > my_pattern_length: + # Other pattern is longer - not most distinctive + return False + + return True + + def get_regex_pattern(self) -> str: + """Convert the match pattern into a proper regex based on pattern type.""" + escaped_pattern = re.escape(self.match_pattern) + if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL: + return f"{escaped_pattern}$" + return escaped_pattern.replace(r"\*", ".*") + + def get_matching_delta_urls(self) -> models.QuerySet: + """Get all DeltaUrls that match this pattern.""" + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + regex_pattern = self.get_regex_pattern() + return DeltaUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) + + def get_matching_curated_urls(self) -> models.QuerySet: + """Get all CuratedUrls that match this pattern.""" + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + regex_pattern = self.get_regex_pattern() + return CuratedUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) + + def update_affected_delta_urls_list(self) -> None: + """Update the many-to-many relationship for matched DeltaUrls.""" + 
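The matching rules above can be exercised outside the ORM. The sketch below mirrors `get_regex_pattern` (escape the pattern, then anchor individual URLs with `$` or expand `*` for multi-URL patterns) and the "fewest matched URLs wins, longest pattern breaks ties" rule from `is_most_distinctive_pattern`; the URLs and pattern strings are made up for illustration.

```python
import re


def to_regex(match_pattern: str, individual: bool) -> str:
    """Mirror of get_regex_pattern: anchor individual URLs, expand * otherwise."""
    escaped = re.escape(match_pattern)
    return f"{escaped}$" if individual else escaped.replace(r"\*", ".*")


urls = [
    "https://site.gov/docs/a.html",
    "https://site.gov/docs/b.html",
    "https://site.gov/data/c.csv",
]
patterns = {"https://site.gov/*": False, "https://site.gov/docs/*": False}  # pattern -> individual?


def match_count(pattern: str) -> int:
    return sum(bool(re.search(to_regex(pattern, patterns[pattern]), u)) for u in urls)


def most_distinctive(url: str) -> str:
    """Fewest matched URLs wins; the longer pattern string breaks ties."""
    candidates = [p for p in patterns if re.search(to_regex(p, patterns[p]), url)]
    return min(candidates, key=lambda p: (match_count(p), -len(p)))


assert to_regex("https://site.gov/docs/a.html", individual=True).endswith("$")
assert most_distinctive("https://site.gov/docs/a.html") == "https://site.gov/docs/*"
```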
self.delta_urls.set(self.get_matching_delta_urls()) + + def update_affected_curated_urls_list(self) -> None: + """Update the many-to-many relationship for matched CuratedUrls.""" + self.curated_urls.set(self.get_matching_curated_urls()) + + def apply(self) -> None: + """Apply pattern effects. Must be implemented by subclasses.""" + raise NotImplementedError + + def unapply(self) -> None: + """Remove pattern effects. Must be implemented by subclasses.""" + raise NotImplementedError + + def save(self, *args, **kwargs) -> None: + super().save(*args, **kwargs) + self.apply() + + def delete(self, *args, **kwargs) -> None: + self.unapply() + super().delete(*args, **kwargs) + + class Meta: + abstract = True + ordering = ["match_pattern"] + unique_together = ("collection", "match_pattern") + + def __str__(self): + return self.match_pattern + + +class InclusionPatternBase(BaseMatchPattern): + """ + Base class for patterns that handle URL inclusion/exclusion. + Both ExcludePattern and IncludePattern share the same core logic for managing + relationships and Delta URL creation/cleanup. + """ + + class Meta(BaseMatchPattern.Meta): + abstract = True + + def apply(self) -> None: + """ + Apply pattern effects to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs for newly affected Curated URLs if needed + 3. Update pattern relationships to manage inclusion/exclusion status + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + + # Get QuerySet of all matching CuratedUrls + matching_curated_urls = self.get_matching_curated_urls() + + # Find Curated URLs that match but weren't previously affected + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) + + # Create Delta URLs for newly affected Curated URLs if needed + for curated_url in previously_unaffected_curated: + # Skip if Delta already exists + if DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists(): + continue + + # Create new Delta URL copying fields from Curated URL + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.create(**fields) + + # Update relationships - this handles inclusion/exclusion status + self.update_affected_delta_urls_list() + + def unapply(self) -> None: + """ + Remove this pattern's effects by: + 1. Creating Delta URLs for previously excluded Curated URLs to show they're no longer excluded/included + 2. 
Cleaning up any Delta URLs that are now identical to their Curated URL counterparts + (these would have only existed to show their exclusion/inclusion) + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + + # Create Delta URLs for previously affected Curated URLs + for curated_url in self.curated_urls.all(): + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.get_or_create(**fields) + + # Clean up redundant Delta URLs + for delta_url in self.delta_urls.filter(to_delete=False): + try: + curated_url = CuratedUrl.objects.get(collection=self.collection, url=delta_url.url) + + # Check if Delta is now identical to Curated + fields_match = all( + getattr(delta_url, field.name) == getattr(curated_url, field.name) + for field in delta_url._meta.fields + if field.name not in ["id", "to_delete"] + ) + + if fields_match: + delta_url.delete() + + except CuratedUrl.DoesNotExist: + continue + + # Clear pattern relationships + self.delta_urls.clear() + self.curated_urls.clear() + + +class DeltaExcludePattern(InclusionPatternBase): + """Pattern for marking URLs for exclusion.""" + + reason = models.TextField("Reason for excluding", default="", blank=True) + + class Meta(InclusionPatternBase.Meta): + verbose_name = "Delta Exclude Pattern" + verbose_name_plural = "Delta Exclude Patterns" + + +class DeltaIncludePattern(InclusionPatternBase): + """Pattern for explicitly including URLs.""" + + class Meta(InclusionPatternBase.Meta): + verbose_name = "Delta Include Pattern" + verbose_name_plural = "Delta Include Patterns" + + +class FieldModifyingPattern(BaseMatchPattern): + """ + Abstract base class for patterns that modify a single field on matching URLs. + Examples: DeltaDivisionPattern, DeltaDocumentTypePattern + """ + + class Meta(BaseMatchPattern.Meta): + abstract = True + + def get_field_to_modify(self) -> str: + """Return the name of the field this pattern modifies. Must be implemented by subclasses.""" + raise NotImplementedError + + def get_new_value(self) -> Any: + """Return the new value for the field. Must be implemented by subclasses.""" + raise NotImplementedError + + def apply(self) -> None: + """ + Apply field modification to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs only for Curated URLs where the field value would change + 3. Update the pattern's list of affected URLs + 4. 
Set the field value on all matching Delta URLs + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + + field = self.get_field_to_modify() + new_value = self.get_new_value() + + # Get newly matching Curated URLs + matching_curated_urls = self.get_matching_curated_urls() + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) + + # Create DeltaUrls only where field value would change + for curated_url in previously_unaffected_curated: + if not self.is_most_distinctive_pattern(curated_url): + continue + + if ( + getattr(curated_url, field) == new_value + or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists() + ): + continue + + fields = { + f.name: getattr(curated_url, f.name) + for f in curated_url._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = new_value + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.create(**fields) + + # Update all matching DeltaUrls with the new field value if this is the most distinctive pattern + for delta_url in self.get_matching_delta_urls(): + if self.is_most_distinctive_pattern(delta_url): + setattr(delta_url, field, new_value) + delta_url.save() + + # Update pattern relationships + self.update_affected_delta_urls_list() + + def unapply(self) -> None: + """ + Remove field modifications: + 1. Create Delta URLs for affected Curated URLs to explicitly set NULL + 2. Remove field value from affected Delta URLs only if no other patterns affect them + 3. Clean up Delta URLs that become identical to their Curated URL + """ + + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + + field = self.get_field_to_modify() + + # Get all affected URLs + affected_deltas = self.delta_urls.all() + affected_curated = self.curated_urls.all() + + # Get all other patterns of same type for this collection + pattern_class = self.__class__ + other_patterns = pattern_class.objects.filter(collection=self.collection).exclude(id=self.id) + + # Process each affected delta URL + for delta in affected_deltas: + curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() + + # Find next most specific matching pattern if any + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)] + + next_pattern = None + if matching_patterns: + # Sort by number of URLs matched (ascending) to find most specific + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + if next_pattern: + # Apply next most specific pattern's value + setattr(delta, field, next_pattern.get_new_value()) + delta.save() + elif curated: + # No other patterns match, revert to curated value + setattr(delta, field, getattr(curated, field)) + delta.save() + + # Check if delta is now redundant + fields_match = all( + getattr(delta, f.name) == getattr(curated, f.name) + for f in delta._meta.fields + if f.name not in ["id", "to_delete"] + ) + if fields_match: + delta.delete() + else: + # No curated URL or other patterns, set to None + setattr(delta, field, None) + delta.save() + + # Handle curated URLs that don't have deltas + for curated in affected_curated: + if not DeltaUrl.objects.filter(url=curated.url).exists(): + # Find any matching patterns + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)] + + if matching_patterns: + # Apply most specific pattern's 
value + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = next_pattern.get_new_value() + DeltaUrl.objects.create(collection=self.collection, **fields) + else: + # No other patterns, create delta with None + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = None + DeltaUrl.objects.create(collection=self.collection, **fields) + + # Clear pattern relationships + self.delta_urls.clear() + self.curated_urls.clear() + + +class DeltaDocumentTypePattern(FieldModifyingPattern): + """Pattern for setting document types.""" + + document_type = models.IntegerField(choices=DocumentTypes.choices) + + def get_field_to_modify(self) -> str: + return "document_type" + + def get_new_value(self) -> Any: + return self.document_type + + class Meta(FieldModifyingPattern.Meta): + verbose_name = "Delta Document Type Pattern" + verbose_name_plural = "Delta Document Type Patterns" + + +class DeltaDivisionPattern(FieldModifyingPattern): + """Pattern for setting divisions.""" + + division = models.IntegerField(choices=Divisions.choices) + + def get_field_to_modify(self) -> str: + return "division" + + def get_new_value(self) -> Any: + return self.division + + class Meta(FieldModifyingPattern.Meta): + verbose_name = "Delta Division Pattern" + verbose_name_plural = "Delta Division Patterns" + + +def validate_title_pattern(title_pattern_string: str) -> None: + """Validate title pattern format.""" + parsed_title = parse_title(title_pattern_string) + + for element_type, element_value in parsed_title: + if element_type == "xpath": + if not is_valid_xpath(element_value): + raise ValidationError(f"Invalid xpath: {element_value}") + elif element_type == "brace": + try: + validate_fstring(element_value) + except ValueError as e: + raise ValidationError(str(e)) + + +class DeltaTitlePattern(BaseMatchPattern): + """Pattern for modifying titles of URLs based on a template pattern.""" + + title_pattern = models.CharField( + "Title Pattern", + help_text="Pattern for the new title. Can be an exact replacement string or sinequa-valid code", + validators=[validate_title_pattern], + ) + + def generate_title_for_url(self, url_obj) -> tuple[str, str | None]: + """ + Generate a new title for a URL using the pattern. + Returns tuple of (generated_title, error_message). + """ + context = { + "url": url_obj.url, + "title": url_obj.scraped_title, + "collection": self.collection.name, + } + + try: + return resolve_title(self.title_pattern, context), None + except Exception as e: + return None, str(e) + + def apply(self) -> None: + """ + Apply the title pattern to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs only where the generated title differs + 3. Update all matching Delta URLs with new titles + 4. 
Track title resolution status and errors + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") + DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") + + # Get newly matching Curated URLs + matching_curated_urls = self.get_matching_curated_urls() + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) + + # Process each previously unaffected curated URL + for curated_url in previously_unaffected_curated: + if not self.is_most_distinctive_pattern(curated_url): + continue + + new_title, error = self.generate_title_for_url(curated_url) + + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=curated_url, defaults={"title_pattern": self, "error_string": error} # lookup field + ) + continue + + # Skip if the generated title matches existing or if Delta already exists + if ( + curated_url.generated_title == new_title + or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists() + ): + continue + + # Create new Delta URL with the new title + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["generated_title"] = new_title + fields["to_delete"] = False + fields["collection"] = self.collection + + delta_url = DeltaUrl.objects.create(**fields) + + # Record successful title resolution + DeltaResolvedTitle.objects.create(title_pattern=self, delta_url=delta_url, resolved_title=new_title) + + # Update titles for all matching Delta URLs + for delta_url in self.get_matching_delta_urls(): + if not self.is_most_distinctive_pattern(delta_url): + continue + + new_title, error = self.generate_title_for_url(delta_url) + + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta_url, defaults={"title_pattern": self, "error_string": error} # lookup field + ) + continue + + # Update title and record resolution - key change here + DeltaResolvedTitle.objects.update_or_create( + delta_url=delta_url, # Only use delta_url for lookup + defaults={"title_pattern": self, "resolved_title": new_title}, + ) + + delta_url.generated_title = new_title + delta_url.save() + + # Update pattern relationships + self.update_affected_delta_urls_list() + + def unapply(self) -> None: + """ + Remove title modifications, maintaining pattern precedence: + 1. Find any remaining patterns that match each URL + 2. Apply most specific matching pattern's title if one exists + 3. Otherwise revert to curated title or clear title + 4. Update title resolution tracking + 5. 
Clean up redundant deltas + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") + DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") + + # Get all affected URLs + affected_deltas = self.delta_urls.all() + affected_curated = self.curated_urls.all() + + # Get all other title patterns for this collection + other_patterns = DeltaTitlePattern.objects.filter(collection=self.collection).exclude(id=self.id) + + # Process each affected delta URL + for delta in affected_deltas: + curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() + + # Find next most specific matching pattern if any + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)] + + next_pattern = None + if matching_patterns: + # Sort by number of URLs matched (ascending) to find most specific + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + if next_pattern: + # Apply next most specific pattern's title + new_title, error = next_pattern.generate_title_for_url(delta) + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta, defaults={"title_pattern": next_pattern, "error_string": error} + ) + else: + delta.generated_title = new_title + delta.save() + DeltaResolvedTitle.objects.update_or_create( + delta_url=delta, defaults={"title_pattern": next_pattern, "resolved_title": new_title} + ) + elif curated: + # No other patterns match, revert to curated title + delta.generated_title = curated.generated_title + delta.save() + + # Check if delta is now redundant + fields_match = all( + getattr(delta, f.name) == getattr(curated, f.name) + for f in delta._meta.fields + if f.name not in ["id", "to_delete"] + ) + if fields_match: + delta.delete() + else: + # No curated URL or other patterns, clear title + delta.generated_title = "" + delta.save() + + # Handle curated URLs that don't have deltas + for curated in affected_curated: + if not DeltaUrl.objects.filter(url=curated.url).exists(): + # Find any matching patterns + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)] + + if matching_patterns: + # Apply most specific pattern's title + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + # Copy all fields from curated + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + + # Generate and apply new title + new_title, error = next_pattern.generate_title_for_url(curated) + if not error: + fields["generated_title"] = new_title + delta = DeltaUrl.objects.create(collection=self.collection, **fields) + DeltaResolvedTitle.objects.create( + title_pattern=next_pattern, delta_url=delta, resolved_title=new_title + ) + else: + # No other patterns, create delta with cleared title + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields["generated_title"] = "" + DeltaUrl.objects.create(collection=self.collection, **fields) + + # Clear resolution tracking for this pattern + DeltaResolvedTitle.objects.filter(title_pattern=self).delete() + DeltaResolvedTitleError.objects.filter(title_pattern=self).delete() + + # Clear pattern relationships + self.delta_urls.clear() + self.curated_urls.clear() + + class Meta(BaseMatchPattern.Meta): + 
verbose_name = "Delta Title Pattern" + verbose_name_plural = "Delta Title Patterns" + + +class DeltaResolvedTitleBase(models.Model): + # TODO: need to understand this logic and whether we need to have these match to CuratedUrls as well + + title_pattern = models.ForeignKey(DeltaTitlePattern, on_delete=models.CASCADE) + delta_url = models.OneToOneField("sde_collections.DeltaUrl", on_delete=models.CASCADE) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + abstract = True + + +class DeltaResolvedTitle(DeltaResolvedTitleBase): + resolved_title = models.CharField(blank=True, default="") + + class Meta: + verbose_name = "Resolved Title" + verbose_name_plural = "Resolved Titles" + + def save(self, *args, **kwargs): + # Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it + DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete() + super().save(*args, **kwargs) + + +class DeltaResolvedTitleError(DeltaResolvedTitleBase): + error_string = models.TextField(null=False, blank=False) + http_status_code = models.IntegerField(null=True, blank=True) diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py new file mode 100644 index 00000000..88df502b --- /dev/null +++ b/sde_collections/models/delta_url.py @@ -0,0 +1,184 @@ +import os +from urllib.parse import urlparse + +from django.contrib.postgres.fields import ArrayField +from django.db import models + +from ..utils.paired_field_descriptor import PairedFieldDescriptor +from .collection_choice_fields import Divisions, DocumentTypes, TDAMMTags +from .delta_patterns import DeltaExcludePattern, DeltaIncludePattern + + +class DeltaUrlQuerySet(models.QuerySet): + def with_exclusion_status(self): + """ + Annotate queryset with exclusion status, taking into account both exclude and include patterns. + Include patterns take precedence over exclude patterns. + """ + return self.annotate( + has_exclude=models.Exists( + DeltaExcludePattern.delta_urls.through.objects.filter(deltaurl=models.OuterRef("pk")) + ), + has_include=models.Exists( + DeltaIncludePattern.delta_urls.through.objects.filter(deltaurl=models.OuterRef("pk")) + ), + excluded=models.Case( + # If has_include is True, URL is not excluded regardless of exclude patterns + models.When(has_include=True, then=models.Value(False)), + # Otherwise, excluded status is determined by presence of exclude pattern + default=models.F("has_exclude"), + output_field=models.BooleanField(), + ), + ) + + +class CuratedUrlQuerySet(models.QuerySet): + def with_exclusion_status(self): + """ + Annotate queryset with exclusion status, taking into account both exclude and include patterns. + Include patterns take precedence over exclude patterns. 
+ """ + return self.annotate( + has_exclude=models.Exists( + DeltaExcludePattern.curated_urls.through.objects.filter(curatedurl=models.OuterRef("pk")) + ), + has_include=models.Exists( + DeltaIncludePattern.curated_urls.through.objects.filter(curatedurl=models.OuterRef("pk")) + ), + excluded=models.Case( + # If has_include is True, URL is not excluded regardless of exclude patterns + models.When(has_include=True, then=models.Value(False)), + # Otherwise, excluded status is determined by presence of exclude pattern + default=models.F("has_exclude"), + output_field=models.BooleanField(), + ), + ) + + +# Manager classes remain unchanged since they just use the updated QuerySets +class DeltaUrlManager(models.Manager): + def get_queryset(self): + return DeltaUrlQuerySet(self.model, using=self._db).with_exclusion_status() + + +class CuratedUrlManager(models.Manager): + def get_queryset(self): + return CuratedUrlQuerySet(self.model, using=self._db).with_exclusion_status() + + +class BaseUrl(models.Model): + """Abstract base class for Urls with shared fields and methods.""" + + url = models.CharField("Url", unique=True) + scraped_title = models.CharField( + "Scraped Title", + default="", + blank=True, + help_text="This is the original title scraped by Sinequa", + ) + scraped_text = models.TextField( + "Scraped Text", + default="", + blank=True, + help_text="This is the text scraped by Sinequa", + ) + generated_title = models.CharField( + "Generated Title", + default="", + blank=True, + help_text="This is the title generated based on a Title Pattern", + ) + + visited = models.BooleanField(default=False) + document_type = models.IntegerField(choices=DocumentTypes.choices, null=True) + division = models.IntegerField(choices=Divisions.choices, null=True) + + tdamm_tag = PairedFieldDescriptor( + field_name="tdamm_tag", + field_type=ArrayField(models.CharField(max_length=255, choices=TDAMMTags.choices), blank=True, null=True), + verbose_name="TDAMM Tags", + ) + + class Meta: + abstract = True + ordering = ["url"] + + @property + def fileext(self) -> str: + # Parse the URL to get the path + parsed_url = urlparse(self.url) + path = parsed_url.path + + # Check for cases where the path ends with a slash or is empty, implying a directory or default file + if path.endswith("/") or not path: + return "html" + + # Extract the extension from the path + extension = os.path.splitext(path)[1] + + # Default to .html if no extension is found + if not extension: + return "html" + + if extension.startswith("."): + return extension[1:] + return extension + + def splits(self) -> list[tuple[str, str]]: + """Split the path into multiple collections.""" + parts = [] + part_string = "" + for part in self.path.split("/"): + if part: + part_string += f"/{part}" + parts.append((part_string, part)) + return parts + + @property + def path(self) -> str: + parsed = urlparse(self.url) + path = f"{parsed.path}" + if parsed.query: + path += f"?{parsed.query}" + return path + + def __str__(self): + return self.url + + +class DumpUrl(BaseUrl): + """Stores the raw dump from the server before deltas are calculated.""" + + collection = models.ForeignKey("Collection", on_delete=models.CASCADE, related_name="dump_urls") + + class Meta: + verbose_name = "Dump Urls" + verbose_name_plural = "Dump Urls" + ordering = ["url"] + + +class DeltaUrl(BaseUrl): + """Urls that are being curated. 
Only deltas are stored in this model.""" + + collection = models.ForeignKey("Collection", on_delete=models.CASCADE, related_name="delta_urls") + + objects = DeltaUrlManager() + to_delete = models.BooleanField(default=False) + + class Meta: + verbose_name = "Delta Urls" + verbose_name_plural = "Delta Urls" + ordering = ["url"] + + +class CuratedUrl(BaseUrl): + """Urls that are curated and ready for production""" + + collection = models.ForeignKey("Collection", on_delete=models.CASCADE, related_name="curated_urls") + + objects = CuratedUrlManager() + + class Meta: + verbose_name = "Curated Urls" + verbose_name_plural = "Curated Urls" + ordering = ["url"] diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index ae5d78ef..774b988e 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -5,10 +5,10 @@ from django.db import models from ..utils.title_resolver import ( - is_valid_fstring, is_valid_xpath, parse_title, resolve_title, + validate_fstring, ) from .collection_choice_fields import Divisions, DocumentTypes @@ -143,10 +143,10 @@ def validate_title_pattern(title_pattern_string): if element_type == "xpath": if not is_valid_xpath(element_value): - raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") + raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231 elif element_type == "brace": try: - is_valid_fstring(element_value) + validate_fstring(element_value) except ValueError as e: raise ValidationError(str(e)) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index 4540bdfb..4c5cc897 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -1,20 +1,21 @@ from rest_framework import serializers -from .models.candidate_url import CandidateURL -from .models.collection import Collection, WorkflowHistory -from .models.collection_choice_fields import Divisions, DocumentTypes -from .models.pattern import ( - DivisionPattern, - DocumentTypePattern, - ExcludePattern, - IncludePattern, - TitlePattern, +from .models.collection import Collection, ReindexingHistory, WorkflowHistory +from .models.collection_choice_fields import Divisions, DocumentTypes, TDAMMTags +from .models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, + DeltaIncludePattern, + DeltaTitlePattern, ) +from .models.delta_url import CuratedUrl, DeltaUrl class CollectionSerializer(serializers.ModelSerializer): curation_status_display = serializers.CharField(source="get_curation_status_display", read_only=True) workflow_status_display = serializers.CharField(source="get_workflow_status_display", read_only=True) + reindexing_status_display = serializers.CharField(source="get_reindexing_status_display", read_only=True) class Meta: model = Collection @@ -22,8 +23,10 @@ class Meta: "id", "curation_status", "workflow_status", + "reindexing_status", "curation_status_display", "workflow_status_display", + "reindexing_status_display", "curated_by", "division", "document_type", @@ -33,6 +36,7 @@ class Meta: "division": {"required": False}, "document_type": {"required": False}, "name": {"required": False}, + # "reindexing_status": {"required": False}, } # extra_kwargs = { @@ -54,76 +58,140 @@ class Meta: fields = "__all__" -class CandidateURLSerializer(serializers.ModelSerializer): +class ReindexingHistorySerializer(serializers.ModelSerializer): + class Meta: + model = ReindexingHistory + fields = "__all__" + + +class 
DeltaURLSerializer(serializers.ModelSerializer): excluded = serializers.BooleanField(required=False) document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) division_display = serializers.CharField(source="get_division_display", read_only=True) url = serializers.CharField(required=False) generated_title_id = serializers.SerializerMethodField(read_only=True) match_pattern_type = serializers.SerializerMethodField(read_only=True) - candidate_urls_count = serializers.SerializerMethodField(read_only=True) + delta_urls_count = serializers.SerializerMethodField(read_only=True) + tdamm_tag = serializers.SerializerMethodField() + exclude_pattern_type = serializers.IntegerField(read_only=True) + include_pattern_id = serializers.IntegerField(read_only=True) + + def get_tdamm_tag(self, obj): + tags = obj.tdamm_tag + return tags if tags is not None else [] - def get_candidate_urls_count(self, obj): - titlepattern = obj.titlepattern_urls.last() - return titlepattern.candidate_urls.count() if titlepattern else 0 + def get_delta_urls_count(self, obj): + titlepattern = obj.deltatitlepatterns.last() + return titlepattern.delta_urls.count() if titlepattern else 0 def get_generated_title_id(self, obj): - titlepattern = obj.titlepattern_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.id if titlepattern else None def get_match_pattern_type(self, obj): - titlepattern = obj.titlepattern_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.match_pattern_type if titlepattern else None class Meta: - model = CandidateURL + model = DeltaUrl fields = ( "id", "excluded", "url", + "to_delete", "scraped_title", "generated_title", "generated_title_id", "match_pattern_type", - "candidate_urls_count", + "delta_urls_count", "document_type", "document_type_display", "division", "division_display", "visited", - "test_title", - "production_title", - "present_on_test", - "present_on_prod", + "tdamm_tag", + "exclude_pattern_type", + "include_pattern_id", ) -class CandidateURLBulkCreateSerializer(serializers.ModelSerializer): +class CuratedURLSerializer(serializers.ModelSerializer): + excluded = serializers.BooleanField(required=False) + document_type_display = serializers.CharField(source="get_document_type_display", read_only=True) + division_display = serializers.CharField(source="get_division_display", read_only=True) + url = serializers.CharField(required=False) + generated_title_id = serializers.SerializerMethodField(read_only=True) + match_pattern_type = serializers.SerializerMethodField(read_only=True) + curated_urls_count = serializers.SerializerMethodField(read_only=True) + tdamm_tag = serializers.SerializerMethodField() + + def get_tdamm_tag(self, obj): + tags = obj.tdamm_tag + return tags if tags is not None else [] + + def get_curated_urls_count(self, obj): + titlepattern = obj.deltatitlepatterns.last() + return titlepattern.curated_urls.count() if titlepattern else 0 + + def get_generated_title_id(self, obj): + titlepattern = obj.deltatitlepatterns.last() + return titlepattern.id if titlepattern else None + + def get_match_pattern_type(self, obj): + titlepattern = obj.deltatitlepatterns.last() + return titlepattern.match_pattern_type if titlepattern else None + class Meta: - model = CandidateURL + model = CuratedUrl + fields = ( + "id", + "excluded", + "url", + "scraped_title", + "generated_title", + "generated_title_id", + "match_pattern_type", + "curated_urls_count", + "document_type", + "document_type_display", + 
"division", + "division_display", + "visited", + "tdamm_tag", + ) + + +class DeltaURLBulkCreateSerializer(serializers.ModelSerializer): + class Meta: + model = DeltaUrl fields = ( "url", "scraped_title", ) -class CandidateURLAPISerializer(serializers.ModelSerializer): +class DeltaURLAPISerializer(serializers.ModelSerializer): document_type = serializers.SerializerMethodField() title = serializers.SerializerMethodField() file_extension = serializers.SerializerMethodField() tree_root = serializers.SerializerMethodField() + tdamm_tag = serializers.SerializerMethodField() class Meta: - model = CandidateURL + model = DeltaUrl fields = ( "url", "title", "document_type", - "hash", "file_extension", "tree_root", + "tdamm_tag", ) + def get_tdamm_tag(self, obj): + tags = obj.tdamm_tag + return tags if tags is not None else [] + def get_document_type(self, obj): if obj.document_type is not None: return obj.get_document_type_display() @@ -141,19 +209,88 @@ def get_file_extension(self, obj): def get_tree_root(self, obj): if obj.collection.is_multi_division: if obj.division: - return f"/{obj.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.get_division_display()}/{obj.collection.name}/" else: - return f"/{obj.collection.get_division_display()}/{obj.collection.config_folder}" + return f"/{obj.collection.get_division_display()}/{obj.collection.name}/" + else: + return obj.collection.tree_root + + +class CuratedURLAPISerializer(serializers.ModelSerializer): + document_type = serializers.SerializerMethodField() + title = serializers.SerializerMethodField() + file_extension = serializers.SerializerMethodField() + tree_root = serializers.SerializerMethodField() + tdamm_tag = serializers.SerializerMethodField() + + class Meta: + model = CuratedUrl + fields = ( + "url", + "title", + "document_type", + "file_extension", + "tree_root", + "tdamm_tag", + ) + + def get_tdamm_tag(self, obj): + empty_categories = {"messengers": [], "objects": [], "signals": []} + if not obj.tdamm_tag or obj.tdamm_tag == ["NOT_TDAMM"]: + return empty_categories + + categories = empty_categories.copy() + prefix_mapping = {"MMA_M_": "messengers", "MMA_O_": "objects", "MMA_S_": "signals"} + + for tag in obj.tdamm_tag: + if tag == "NOT_TDAMM": + continue + + tag_text = dict(TDAMMTags.choices).get(tag) + if not tag_text: + continue + + for prefix, category in prefix_mapping.items(): + if tag.startswith(prefix): + categories[category].append(tag_text.replace(" - ", "/")) + break + + return categories + + def get_document_type(self, obj): + if obj.document_type is not None: + return obj.get_document_type_display() + elif obj.collection.document_type is not None: + return obj.collection.get_document_type_display() + else: + return "Unknown" + + def get_title(self, obj): + return obj.generated_title if obj.generated_title else obj.scraped_title + + def get_file_extension(self, obj): + return obj.fileext + + def get_tree_root(self, obj): + if obj.collection.is_multi_division: + if obj.division: + return f"/{obj.get_division_display()}/{obj.collection.name}/" + else: + return f"/{obj.collection.get_division_display()}/{obj.collection.name}/" else: return obj.collection.tree_root class BasePatternSerializer(serializers.ModelSerializer): match_pattern_type_display = serializers.CharField(source="get_match_pattern_type_display", read_only=True) - candidate_urls_count = serializers.SerializerMethodField(read_only=True) + delta_urls_count = serializers.SerializerMethodField(read_only=True) + curated_urls_count = 
serializers.SerializerMethodField(read_only=True) + + def get_delta_urls_count(self, instance): + return instance.delta_urls.count() - def get_candidate_urls_count(self, instance): - return instance.candidate_urls.count() + def get_curated_urls_count(self, instance): + return instance.curated_urls.count() class Meta: fields = ( @@ -162,36 +299,37 @@ class Meta: "match_pattern", "match_pattern_type", "match_pattern_type_display", - "candidate_urls_count", + "delta_urls_count", + "curated_urls_count", ) abstract = True class ExcludePatternSerializer(BasePatternSerializer, serializers.ModelSerializer): class Meta: - model = ExcludePattern + model = DeltaExcludePattern fields = BasePatternSerializer.Meta.fields + ("reason",) class IncludePatternSerializer(BasePatternSerializer, serializers.ModelSerializer): class Meta: - model = IncludePattern + model = DeltaIncludePattern fields = BasePatternSerializer.Meta.fields class TitlePatternSerializer(BasePatternSerializer, serializers.ModelSerializer): class Meta: - model = TitlePattern + model = DeltaTitlePattern fields = BasePatternSerializer.Meta.fields + ("title_pattern",) def validate_match_pattern(self, value): try: - title_pattern = TitlePattern.objects.get( + title_pattern = DeltaTitlePattern.objects.get( match_pattern=value, - match_pattern_type=TitlePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, ) title_pattern.delete() - except TitlePattern.DoesNotExist: + except DeltaTitlePattern.DoesNotExist: pass return value @@ -206,7 +344,7 @@ class DocumentTypePatternSerializer(BasePatternSerializer, serializers.ModelSeri ) class Meta: - model = DocumentTypePattern + model = DeltaDocumentTypePattern fields = BasePatternSerializer.Meta.fields + ( "document_type", "document_type_display", @@ -214,12 +352,12 @@ class Meta: def validate_match_pattern(self, value): try: - title_pattern = DocumentTypePattern.objects.get( + title_pattern = DeltaDocumentTypePattern.objects.get( match_pattern=value, - match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, ) title_pattern.delete() - except DocumentTypePattern.DoesNotExist: + except DeltaDocumentTypePattern.DoesNotExist: pass return value @@ -229,7 +367,7 @@ class DivisionPatternSerializer(BasePatternSerializer, serializers.ModelSerializ division = serializers.ChoiceField(choices=Divisions.choices) class Meta: - model = DivisionPattern + model = DeltaDivisionPattern fields = BasePatternSerializer.Meta.fields + ( "division", "division_display", @@ -237,11 +375,11 @@ class Meta: def validate_match_pattern(self, value): try: - division_pattern = DivisionPattern.objects.get( + division_pattern = DeltaDivisionPattern.objects.get( match_pattern=value, - match_pattern_type=DivisionPattern.MatchPatternTypeChoices.INDIVIDUAL_URL, + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.INDIVIDUAL_URL, ) division_pattern.delete() - except DivisionPattern.DoesNotExist: + except DeltaDivisionPattern.DoesNotExist: pass return value diff --git a/sde_collections/sinequa_api.py b/sde_collections/sinequa_api.py index 1dffe26b..8dedbda0 100644 --- a/sde_collections/sinequa_api.py +++ b/sde_collections/sinequa_api.py @@ -1,3 +1,5 @@ +import json +from collections.abc import Iterator from typing import Any import requests @@ -16,72 +18,107 @@ "app_name": "nasa-sba-smd", "query_name": "query-smd-primary", "base_url": 
"https://sciencediscoveryengine.test.nasa.gov", + "index": "sde_index", }, "production": { "app_name": "nasa-sba-smd", "query_name": "query-smd-primary", "base_url": "https://sciencediscoveryengine.nasa.gov", + "index": "sde_index", }, "secret_test": { "app_name": "nasa-sba-sde", "query_name": "query-sde-primary", "base_url": "https://sciencediscoveryengine.test.nasa.gov", + "index": "sde_index", }, "secret_production": { "app_name": "nasa-sba-sde", "query_name": "query-sde-primary", "base_url": "https://sciencediscoveryengine.nasa.gov", + "index": "sde_index", }, - "lis_server": { + "xli": { "app_name": "nasa-sba-smd", "query_name": "query-smd-primary", "base_url": "http://sde-xli.nasa-impact.net", + "index": "sde_index", }, - "lrm_dev_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "lrm_dev": { + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-lrm.nasa-impact.net", + "index": "sde_init_check", }, - "lrm_qa_server": { - "app_name": "nasa-sba-smd", - "query_name": "query-smd-primary", + "lrm_qa": { + "app_name": "sde-init-check", + "query_name": "query-init-check", "base_url": "https://sde-qa.nasa-impact.net", }, } class Api: - def __init__(self, server_name: str) -> None: + def __init__(self, server_name: str = None, user: str = None, password: str = None, token: str = None) -> None: self.server_name = server_name - self.app_name: str = server_configs[server_name]["app_name"] - self.query_name: str = server_configs[server_name]["query_name"] - self.base_url: str = server_configs[server_name]["base_url"] - self.xli_user = settings.XLI_USER - self.xli_password = settings.XLI_PASSWORD - self.lrm_user = settings.LRM_USER - self.lrm_password = settings.LRM_PASSWORD - self.lrm_qa_user = settings.LRM_QA_USER - self.lrm_qa_password = settings.LRM_QA_PASSWORD - - def process_response(self, url: str, payload: dict[str, Any]) -> Any: - response = requests.post(url, headers={}, json=payload, verify=False) - - if response.status_code == requests.status_codes.codes.ok: - meaningful_response = response.json() - else: - raise Exception(response.text) + if server_name not in server_configs: + raise ValueError(f"Invalid server configuration: '{server_name}' is not a recognized server name") + + self.config = server_configs[server_name] + self.app_name: str = self.config["app_name"] + self.query_name: str = self.config["query_name"] + self.base_url: str = self.config["base_url"] + self.dev_servers = ["xli", "lrm_dev", "lrm_qa"] + + self._provided_user = user + self._provided_password = password + self._provided_token = token + + def _get_user(self) -> str | None: + """Retrieve the user, using the provided value or defaulting to Django settings.""" + return self._provided_user or getattr(settings, f"{self.server_name}_USER".upper(), None) + + def _get_password(self) -> str | None: + """Retrieve the password, using the provided value or defaulting to Django settings.""" + return self._provided_password or getattr(settings, f"{self.server_name}_PASSWORD".upper(), None) + + def _get_token(self) -> str | None: + """Retrieve the token, using the provided value or defaulting to Django settings.""" + return self._provided_token or getattr(settings, f"{self.server_name}_TOKEN".upper(), None) - return meaningful_response + def _get_source_name(self) -> str: + """by default, the source is /SDE/. 
However for the various dev servers, the source is tends to be /scrapers/""" + return "scrapers" if self.server_name in self.dev_servers else "SDE" - def query(self, page: int, collection_config_folder: str = "") -> Any: - if self.server_name == "lis_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.xli_password}&User={self.xli_user}" - elif self.server_name == "lrm_dev_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_password}&User={self.lrm_user}" - elif self.server_name == "lrm_qa_server": - url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_qa_password}&User={self.lrm_qa_user}" + def process_response( + self, + url: str, + payload: dict[str, Any] | None = None, + headers: dict[str, str] | None = None, + raw_data: str | None = None, + ) -> Any: + """Sends a POST request and processes the response.""" + response = requests.post( + url, headers=headers, json=payload if raw_data is None else None, data=raw_data, verify=False + ) + if response.status_code == requests.codes.ok: + return response.json() else: - url = f"{self.base_url}/api/v1/search.query" + response.raise_for_status() + + def query(self, page: int, collection_config_folder: str | None = None, source: str | None = None) -> Any: + url = f"{self.base_url}/api/v1/search.query" + if self.server_name in self.dev_servers: + user = self._get_user() + password = self._get_password() + if not user or not password: + raise ValueError( + f"Authentication error: Missing credentials for dev server '{self.server_name}'. " + f"Both username and password are required for servers: {', '.join(self.dev_servers)}" + ) + authentication = f"?Password={password}&User={user}" + url = f"{url}{authentication}" + payload = { "app": self.app_name, "query": { @@ -94,11 +131,165 @@ def query(self, page: int, collection_config_folder: str = "") -> Any: } if collection_config_folder: - if self.server_name == "lis_server": - payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/" - else: - payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/" + source = source if source else self._get_source_name() + payload["query"]["advanced"]["collection"] = f"/{source}/{collection_config_folder}/" + + return self.process_response(url, payload) + + def _execute_sql_query(self, sql: str) -> dict: + """ + Executes a SQL query against the Sinequa API. + + Args: + sql (str): The SQL query to execute + + Returns: + dict: The JSON response from the API containing 'Rows' and 'TotalRowCount' + + Raises: + ValueError: If no token is available for authentication + """ + token = self._get_token() + if not token: + raise ValueError("Authentication error: Token is required for SQL endpoint access") + + url = f"{self.base_url}/api/v1/engine.sql" + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + raw_payload = json.dumps( + { + "method": "engine.sql", + "sql": sql, + "pretty": True, + } + ) + + return self.process_response(url, headers=headers, raw_data=raw_payload) + + def _process_rows_to_records(self, rows: list) -> list[dict]: + """ + Converts raw SQL row data into structured record dictionaries. 
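# Reviewer note (illustrative sketch, not part of the diff): how the pieces above are expected to
# fit together. Api("lrm_dev") falls back to settings.LRM_DEV_USER / LRM_DEV_PASSWORD /
# LRM_DEV_TOKEN unless explicit arguments are passed, and the token is what unlocks the
# engine.sql endpoint. "EXAMPLE_FOLDER" is a placeholder collection folder.
#
#   api = Api("lrm_dev", token="<engine-token>")
#   sql = (
#       f"SELECT url1, text, title FROM {api.config['index']} "
#       "WHERE collection = '/scrapers/EXAMPLE_FOLDER/' SKIP 0 COUNT 10"
#   )
#   response = api._execute_sql_query(sql)                       # {"Rows": [...], "TotalRowCount": N}
#   records = api._process_rows_to_records(response.get("Rows", []))
#   # records -> [{"url": ..., "full_text": ..., "title": ...}, ...]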
+ + Args: + rows (list): List of rows, where each row is [url, full_text, title] + + Returns: + list[dict]: List of processed records with url, full_text, and title keys + + Raises: + ValueError: If any row doesn't contain exactly 3 elements + """ + processed_records = [] + for idx, row in enumerate(rows): + if len(row) != 3: + raise ValueError( + f"Invalid row format at index {idx}: Expected exactly three elements (url, full_text, title). " + f"Received {len(row)} elements." + ) + processed_records.append({"url": row[0], "full_text": row[1], "title": row[2]}) + return processed_records + + def get_full_texts( + self, + collection_config_folder: str, + source: str = None, + start_at: int = 0, + batch_size: int = 500, + min_batch_size: int = 1, + ) -> Iterator[dict]: + """ + Retrieves and yields batches of text records from the SQL database for a given collection. + Uses pagination to handle large datasets efficiently. If a query fails, it automatically + reduces the batch size and retries, with the ability to recover batch size after successful queries. + + Args: + collection_config_folder (str): The collection folder to query (e.g., "EARTHDATA", "CASEI") + source (str, optional): The source to query. If None, defaults to "scrapers" for dev servers + or "SDE" for other servers. + start_at (int, optional): Starting offset for records. Defaults to 0. + page_size (int, optional): Initial number of records per batch. Defaults to 500. + min_batch_size (int, optional): Minimum batch size before giving up. Defaults to 1. + + Yields: + list[dict]: Batches of records, where each record is a dictionary containing: + { + "url": str, # The URL of the document + "full_text": str, # The full text content of the document + "title": str # The title of the document + } + + Raises: + ValueError: If the server's index is not defined in its configuration + ValueError: If batch size reaches minimum without success + + Note: + - Results are paginated with adaptive batch sizing + - Each batch is processed into clean dictionaries before being yielded + - The iterator will stop when either: + 1. No more rows are returned from the query + 2. The total count of records has been reached + - Batch size will decrease on failure and can recover after successful queries + """ + if not source: + source = self._get_source_name() + + if (index := self.config.get("index")) is None: + raise ValueError( + f"Configuration error: Index not defined for server '{self.server_name}'. " + "Please update server configuration with the required index." + ) + + base_sql = f"SELECT url1, text, title FROM {index} WHERE collection = '/{source}/{collection_config_folder}/'" + + current_offset = start_at + current_batch_size = batch_size + total_count = None + + while True: + sql = f"{base_sql} SKIP {current_offset} COUNT {current_batch_size}" + + try: + response = self._execute_sql_query(sql) + rows = response.get("Rows", []) + + if not rows: # Stop if we get an empty batch + break + + if total_count is None: + total_count = response.get("TotalRowCount", 0) + + yield self._process_rows_to_records(rows) + + current_offset += len(rows) + + if total_count and current_offset >= total_count: # Stop if we've processed all records + break + + except (requests.RequestException, ValueError) as e: + if current_batch_size <= min_batch_size: + raise ValueError( + f"Failed to process batch even at minimum size {min_batch_size}. 
" f"Last error: {str(e)}" + ) + + # Halve the batch size and retry + current_batch_size = max(current_batch_size // 2, min_batch_size) + print(f"Reducing batch size to {current_batch_size} and retrying...") + continue - response = self.process_response(url, payload) + @staticmethod + def _process_full_text_response(batch_data: dict): + if "Rows" not in batch_data or not isinstance(batch_data["Rows"], list): + raise ValueError( + "Invalid response format: Expected 'Rows' key with list data in Sinequa server response. " + f"Received: {type(batch_data.get('Rows', None))}" + ) - return response + processed_data = [] + for idx, row in enumerate(batch_data["Rows"]): + if len(row) != 3: + raise ValueError( + f"Invalid row format at index {idx}: Expected exactly three elements (url, full_text, title). " + f"Received {len(row)} elements." + ) + url, full_text, title = row + processed_data.append({"url": url, "full_text": full_text, "title": title}) + return processed_data diff --git a/sde_collections/tasks.py b/sde_collections/tasks.py index fa754efc..86605124 100644 --- a/sde_collections/tasks.py +++ b/sde_collections/tasks.py @@ -7,10 +7,15 @@ from django.conf import settings from django.core import management from django.core.management.commands import loaddata +from django.db import transaction from config import celery_app +from sde_collections.models.collection_choice_fields import ( + ReindexingStatusChoices, + WorkflowStatusChoices, +) -from .models.collection import Collection, WorkflowStatusChoices +from .models.delta_url import DumpUrl from .sinequa_api import Api from .utils.github_helper import GitHubHandler @@ -49,7 +54,7 @@ def _get_data_to_import(collection, server_name): continue augmented_data = { - "model": "sde_collections.candidateurl", + "model": "sde_collections.url", "fields": { "collection": collection_pk, "url": url, @@ -66,6 +71,7 @@ def _get_data_to_import(collection, server_name): def import_candidate_urls_from_api(server_name="test", collection_ids=[]): TEMP_FOLDER_NAME = "temp" os.makedirs(TEMP_FOLDER_NAME, exist_ok=True) + Collection = apps.get_model("sde_collections", "Collection") collections = Collection.objects.filter(id__in=collection_ids) @@ -104,6 +110,8 @@ def import_candidate_urls_from_api(server_name="test", collection_ids=[]): @celery_app.task() def push_to_github_task(collection_ids): + Collection = apps.get_model("sde_collections", "Collection") + collections = Collection.objects.filter(id__in=collection_ids) github_handler = GitHubHandler(collections) github_handler.push_to_github() @@ -111,12 +119,16 @@ def push_to_github_task(collection_ids): @celery_app.task() def sync_with_production_webapp(): + Collection = apps.get_model("sde_collections", "Collection") + for collection in Collection.objects.all(): collection.sync_with_production_webapp() @celery_app.task() def pull_latest_collection_metadata_from_github(): + Collection = apps.get_model("sde_collections", "Collection") + FILENAME = "github_collections.json" gh = GitHubHandler(collections=Collection.objects.none()) @@ -141,3 +153,70 @@ def resolve_title_pattern(title_pattern_id): TitlePattern = apps.get_model("sde_collections", "TitlePattern") title_pattern = TitlePattern.objects.get(id=title_pattern_id) title_pattern.apply() + + +@celery_app.task(soft_time_limit=600) +def fetch_and_replace_full_text(collection_id, server_name): + """ + Task to fetch and replace full text and metadata for a collection. 
+ Handles data in batches to manage memory usage and updates appropriate statuses + upon completion. + """ + Collection = apps.get_model("sde_collections", "Collection") + + collection = Collection.objects.get(id=collection_id) + api = Api(server_name) + + initial_workflow_status = collection.workflow_status + initial_reindexing_status = collection.reindexing_status + + # Step 1: Delete existing DumpUrl entries + deleted_count, _ = DumpUrl.objects.filter(collection=collection).delete() + print(f"Deleted {deleted_count} old records.") + + try: + # Step 2: Process data in batches + total_processed = 0 + for batch in api.get_full_texts(collection.config_folder): + with transaction.atomic(): + DumpUrl.objects.bulk_create( + [ + DumpUrl( + url=record["url"], + collection=collection, + scraped_text=record["full_text"], + scraped_title=record["title"], + ) + for record in batch + ] + ) + total_processed += len(batch) + print(f"Processed batch of {len(batch)} records. Total: {total_processed}") + + # Step 3: Migrate dump URLs to delta URLs + collection.migrate_dump_to_delta() + + # Step 4: Update statuses if needed + collection.refresh_from_db() + + # Check workflow status transition + pre_workflow_statuses = [ + WorkflowStatusChoices.RESEARCH_IN_PROGRESS, + WorkflowStatusChoices.READY_FOR_ENGINEERING, + WorkflowStatusChoices.ENGINEERING_IN_PROGRESS, + WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV, + ] + if initial_workflow_status in pre_workflow_statuses: + collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION + collection.save() + + # Check reindexing status transition + if initial_reindexing_status == ReindexingStatusChoices.REINDEXING_FINISHED_ON_DEV: + collection.reindexing_status = ReindexingStatusChoices.REINDEXING_READY_FOR_CURATION + collection.save() + + return f"Successfully processed {total_processed} records and updated the database." + + except Exception as e: + print(f"Error processing records: {str(e)}") + raise diff --git a/sde_collections/tests/factories.py b/sde_collections/tests/factories.py new file mode 100644 index 00000000..dded5d5c --- /dev/null +++ b/sde_collections/tests/factories.py @@ -0,0 +1,90 @@ +import factory +from django.contrib.auth import get_user_model +from django.utils import timezone + +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import ( + ConnectorChoices, + Divisions, + DocumentTypes, + UpdateFrequencies, + WorkflowStatusChoices, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl + +User = get_user_model() + + +class UserFactory(factory.django.DjangoModelFactory): + class Meta: + model = User + + username = factory.Sequence(lambda n: f"user{n}") + email = factory.LazyAttribute(lambda obj: f"{obj.username}@example.com") + + +class CollectionFactory(factory.django.DjangoModelFactory): + class Meta: + model = Collection + + name = factory.Faker("company") + config_folder = factory.Sequence( + lambda n: f"config_folder_{n}" + ) # might need to update this to be calculated based on name? 
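# Reviewer note (illustrative sketch, not part of the diff): typical use of these factories in a
# test; the values shown are placeholders produced by the factory defaults.
#
#   collection = CollectionFactory(name="Example Collection")
#   delta = DeltaUrlFactory(collection=collection, url="https://example.com/a")
#   assert delta.collection.config_folder.startswith("config_folder_")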
+ url = factory.Faker("url") + division = Divisions.ASTROPHYSICS + connector = ConnectorChoices.CRAWLER2 + update_frequency = UpdateFrequencies.WEEKLY + document_type = DocumentTypes.DOCUMENTATION + delete = False + is_multi_division = False + + github_issue_number = factory.Sequence(lambda n: n) + notes = factory.Faker("paragraph") + updated_at = factory.LazyFunction(timezone.now) + new_collection = False + + workflow_status = WorkflowStatusChoices.RESEARCH_IN_PROGRESS + tracker = factory.Maybe("workflow_status") + + # ForeignKey to User for `curated_by` + curated_by = factory.SubFactory(UserFactory) + curation_started = factory.LazyFunction(timezone.now) + + +class DumpUrlFactory(factory.django.DjangoModelFactory): + class Meta: + model = DumpUrl + + collection = factory.SubFactory(CollectionFactory) + url = factory.Faker("url") + scraped_title = factory.Faker("sentence") + scraped_text = factory.Faker("paragraph") + # generated_title = factory.Faker("sentence") + # visited = factory.Faker("boolean") + # document_type = 1 + # division = 1 + + +class DeltaUrlFactory(factory.django.DjangoModelFactory): + class Meta: + model = DeltaUrl + + collection = factory.SubFactory(CollectionFactory) + url = factory.Faker("url") + scraped_title = factory.Faker("sentence") + to_delete = False + + +class CuratedUrlFactory(factory.django.DjangoModelFactory): + class Meta: + model = CuratedUrl + + collection = factory.SubFactory(CollectionFactory) + url = factory.Faker("url") + scraped_title = factory.Faker("sentence") + scraped_text = factory.Faker("paragraph") + generated_title = factory.Faker("sentence") + visited = factory.Faker("boolean") + document_type = 1 + division = 1 diff --git a/sde_collections/tests/frontend/__init__.py b/sde_collections/tests/frontend/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sde_collections/tests/frontend/base.py b/sde_collections/tests/frontend/base.py new file mode 100644 index 00000000..66a82941 --- /dev/null +++ b/sde_collections/tests/frontend/base.py @@ -0,0 +1,50 @@ +import shutil + +import pytest +from django.contrib.staticfiles.testing import StaticLiveServerTestCase +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.support.ui import WebDriverWait + +from .mixins import AuthenticationMixin + + +class BaseTestCase(StaticLiveServerTestCase, AuthenticationMixin): + """Base class for all frontend tests using Selenium.""" + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Verify ChromeDriver and Chromium are available + chromedriver_path = shutil.which("chromedriver") + chromium_path = shutil.which("chromium") + + if not chromedriver_path: + pytest.fail("ChromeDriver not found. Please ensure chromium-driver is installed.") + if not chromium_path: + pytest.fail("Chromium not found. 
Please ensure chromium is installed.") + + # Set up Chrome options + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.binary_location = chromium_path + + try: + service = Service(executable_path=chromedriver_path) + cls.driver = webdriver.Chrome(service=service, options=chrome_options) + cls.driver.set_window_size(1920, 1080) + cls.driver.implicitly_wait(10) + cls.wait = WebDriverWait(cls.driver, 10) + + except Exception as e: + pytest.fail(f"Failed to initialize ChromeDriver: {str(e)}") + + @classmethod + def tearDownClass(cls): + if hasattr(cls, "driver"): + cls.driver.quit() + super().tearDownClass() diff --git a/sde_collections/tests/frontend/mixins.py b/sde_collections/tests/frontend/mixins.py new file mode 100644 index 00000000..8b24cac1 --- /dev/null +++ b/sde_collections/tests/frontend/mixins.py @@ -0,0 +1,55 @@ +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC + +from ..factories import UserFactory + + +class AuthenticationMixin: + """Mixin for authentication-related test methods.""" + + def create_test_user(self, username="test_user", password="test_password123", **kwargs): + """Create a test user using UserFactory.""" + # Delete user if it already exists + UserFactory._meta.model.objects.filter(username=username).delete() + + user = UserFactory(username=username, is_active=True, **kwargs) + user.set_password(password) + user.save() + + return user, password + + def login(self, username="test_user", password="test_password123"): + """ + Login helper method. + Returns True if login successful, False otherwise. + """ + self.driver.get(f"{self.live_server_url}/accounts/login/") + + try: + username_input = self.wait.until(EC.presence_of_element_located((By.NAME, "login"))) + username_input.send_keys(username) + + password_input = self.driver.find_element(By.NAME, "password") + password_input.send_keys(password) + + login_button = self.driver.find_element(By.CSS_SELECTOR, "button[type='submit']") + login_button.click() + + self.wait.until(EC.title_is("Collections | COSMOS")) + return True + + except Exception as e: + print(f"Login failed: {str(e)}") + return False + + def logout(self): + """Logout helper method.""" + try: + logout_link = self.driver.find_element(By.CSS_SELECTOR, "a[href='/accounts/logout/']") + self.driver.execute_script("arguments[0].click();", logout_link) + + self.wait.until(EC.presence_of_element_located((By.NAME, "login"))) + return True + except Exception as e: + print(f"Logout failed: {str(e)}") + return False diff --git a/sde_collections/tests/frontend/test_auth.py b/sde_collections/tests/frontend/test_auth.py new file mode 100644 index 00000000..1a31fac7 --- /dev/null +++ b/sde_collections/tests/frontend/test_auth.py @@ -0,0 +1,56 @@ +from selenium.webdriver.common.by import By + +from .base import BaseTestCase + + +class TestAuthentication(BaseTestCase): + """Test authentication functionality.""" + + def setUp(self): + super().setUp() + # Create test user with factory + self.user, self.password = self.create_test_user( + username="test_user", password="test_password123", is_staff=True + ) + + def test_successful_login(self): + """Test successful login process.""" + # Attempt login + login_success = self.login(self.user.username, self.password) + assert login_success, "Login Failed" + + # Verify successful login by checking welcome message + assert "Welcome 
back!" in self.driver.page_source, "Welcome message not found" + + def test_failed_login(self): + """Test login failure with incorrect credentials.""" + # Attempt login with wrong password + login_success = self.login(self.user.username, "wrong_password") + assert not login_success, "Login should fail with incorrect password" + + # Verify we're still on login page + assert "/accounts/login/" in self.driver.current_url, "Should remain on login page" + + # Verify error message is displayed + error_message = (self.driver.find_element(By.CLASS_NAME, "alert")).text + assert "The username and/or password you specified are not correct" in error_message, "Error message not found" + + def test_logout(self): + """Test logout functionality.""" + # First login + login_success = self.login(self.user.username, self.password) + assert login_success, "Initial login failed" + + # Verify we're logged in + assert "Welcome back!" in self.driver.page_source, "Not properly logged in" + + # Perform logout + logout_success = self.logout() + assert logout_success, "Logout failed" + + # Verify redirect to login page + assert "/accounts/login/" in self.driver.current_url, "Should redirect to login page after logout" + + def tearDown(self): + """Clean up after each test.""" + super().tearDown() diff --git a/sde_collections/tests/frontend/test_homepage_features.py b/sde_collections/tests/frontend/test_homepage_features.py new file mode 100644 index 00000000..8bb439ae --- /dev/null +++ b/sde_collections/tests/frontend/test_homepage_features.py @@ -0,0 +1,245 @@ +# docker-compose -f local.yml run --rm django pytest -s sde_collections/tests/frontend/test_homepage_features.py + +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC + +from ..factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + UserFactory, +) +from .base import BaseTestCase + + +class TestHomepageFeatures(BaseTestCase): + """Test features available in COSMOS Homepage""" + + def setUp(self): + """Set up test data.""" + super().setUp() + self.user, self.password = self.create_test_user(is_staff=True) + + # Create 3 test collections + self.collections = [CollectionFactory(curated_by=self.user) for _ in range(3)] + self.collection_names = [collection.name for collection in self.collections] + + self.login(self.user.username, self.password) + + def test_collections_display(self): + """Test that collections are displayed after login.""" + # Navigate to collections page + self.driver.get(f"{self.live_server_url}/") + + table = self.wait.until(EC.presence_of_element_located((By.ID, "collection_table"))) + assert "table-striped dataTable" in table.get_attribute("class") + + # Verify each collection name is present + table_text = table.text + for collection_name in self.collection_names: + assert collection_name in table_text, f"Collection '{collection_name}' not found in table" + + def test_universal_search(self): + """Test universal search functionality.""" + + self.driver.get(f"{self.live_server_url}/") + # Wait for search input and enter search term + search_input = self.wait.until(EC.presence_of_element_located((By.ID, "collectionSearch"))) + search_input.send_keys(self.collections[0].name) # Search for first collection + + # Wait for table to update + table = self.wait.until(EC.presence_of_element_located((By.ID, "collection_table"))) + + # Verify search results + table_text = table.text + assert self.collections[0].name in table_text, "Target collection should be present" + 
assert self.collections[1].name not in table_text, "Collection #2 should not be present" + assert self.collections[2].name not in table_text, "Collection #3 should not be present" + + +class TestSearchPaneFeatures(BaseTestCase): + """Test search pane features on homepage""" + + def setUp(self): + super().setUp() + self.user, self.password = self.create_test_user(is_staff=True) + self.second_test_user = UserFactory() + self.third_test_user = UserFactory() + + # Create collections with diverse attributes + self.collections = [ + CollectionFactory(curated_by=self.user, division=1, workflow_status=3, connector=1, reindexing_status=1), + CollectionFactory( + curated_by=self.second_test_user, division=3, workflow_status=1, connector=1, reindexing_status=2 + ), + CollectionFactory( + curated_by=self.third_test_user, division=4, workflow_status=3, connector=2, reindexing_status=4 + ), + ] + + # Factory sometimes struggle to generate unique URLs by itself, so applying this technique + self.delta_urls = [] + self.curated_urls = [] + for i, collection in enumerate(self.collections): + num_urls = 10**i # 1, 10, 100, ... + self.delta_urls.extend( + [ + DeltaUrlFactory(collection=collection, url=f"https://example-{collection.id}-{j}.com") + for j in range(num_urls) + ] + ) + self.curated_urls.extend( + [ + CuratedUrlFactory(collection=collection, url=f"https://example-{collection.id}-{j}.com") + for j in range(num_urls) + ] + ) + + self.login(self.user.username, self.password) + self.driver.get(f"{self.live_server_url}/") + self.COLUMNS = self.driver.execute_script("return COLUMNS;") + + def test_division_searchpane(self): + """Test division search pane filtering""" + + # Find and click Astrophysics option + astrophysics_option = self.wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, "span.dtsp-name[title='Astrophysics']")) + ) + astrophysics_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows Astrophysics division + for row in rows: + division_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["DIVISION"]] + assert division_cell.text.lower() == "astrophysics", f"Expected Astrophysics but found {division_cell.text}" + + def test_delta_urls_searchpane(self): + """Test Delta URLs search pane filtering""" + + # Find the Delta URLs pane using its index and then find the "1 solo URL" option within it + search_panes = self.driver.find_elements(By.CSS_SELECTOR, "div.dtsp-searchPane") + delta_urls_pane = search_panes[self.COLUMNS["DELTA_URLS"]] + delta_url_option = delta_urls_pane.find_element(By.CSS_SELECTOR, "span.dtsp-name[title='1 solo URL']") + delta_url_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows "1" in Delta URLs column + for row in rows: + delta_urls_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["DELTA_URLS"]] + assert delta_urls_cell.text == "1", f"Expected '1' but found {delta_urls_cell.text}" + + def test_curated_urls_searchpane(self): + """Test Curated URLs search pane filtering""" + + # Find the Curated URLs pane using its index and then find the "1 to 100 URLs" option within it + search_panes = self.driver.find_elements(By.CSS_SELECTOR, "div.dtsp-searchPane") + curated_urls_pane = search_panes[self.COLUMNS["CURATED_URLS"]] + 
curated_url_option = curated_urls_pane.find_element(By.CSS_SELECTOR, "span.dtsp-name[title='1 to 100 URLs']") + curated_url_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows a number between 1 and 100 in Curated URLs column + for row in rows: + curated_urls_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["CURATED_URLS"]] + url_count = int(curated_urls_cell.text) + assert 1 < url_count <= 100, f"Expected number between 1 and 100 but found {url_count}" + + def test_workflow_status_searchpane(self): + """Test Workflow Status search pane filtering""" + + # Find and click the option with "Engineering in Progress" button + workflow_status_option = self.wait.until( + EC.element_to_be_clickable( + (By.XPATH, "//div[@class='dtsp-nameCont']//button[text()='Engineering in Progress']") + ) + ) + workflow_status_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows "ENGINEERING IN PROGRESS" in Workflow Status column + for row in rows: + workflow_status_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["WORKFLOW_STATUS"]] + assert ( + workflow_status_cell.text.lower() == "engineering in progress" + ), f"Expected 'ENGINEERING IN PROGRESS' but found {workflow_status_cell.text}" + + def test_curator_searchpane(self): + """Test Curator search pane filtering""" + + # Find and click the option with "test_user" button + curator_option = self.wait.until( + EC.element_to_be_clickable((By.XPATH, "//div[@class='dtsp-nameCont']//button[text()='test_user']")) + ) + curator_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows "test_user" in Curator column + for row in rows: + curator_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["CURATOR"]] + assert curator_cell.text.lower() == "test_user", f"Expected 'test_user' but found {curator_cell.text}" + + def test_connector_type_searchpane(self): + """Test Connector Type search pane filtering""" + + # Find and click "crawler2" option + crawler2_option = self.wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, "span.dtsp-name[title='crawler2']")) + ) + crawler2_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 0, "No rows found after filtering" + + # Verify each row shows "crawler2" connector type + for row in rows: + connector_type_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["CONNECTOR_TYPE"]] + assert ( + connector_type_cell.text.lower() == "crawler2" + ), f"Expected 'crawler2' but found {connector_type_cell.text}" + + def test_reindexing_status_searchpane(self): + """Test Reindexing Status search pane filtering""" + + # Find and click the option with "Re-Indexing Not Needed" button + reindexing_option = self.wait.until( + EC.element_to_be_clickable( + (By.XPATH, "//div[@class='dtsp-nameCont']//button[text()='Re-Indexing Not Needed']") + ) + ) + reindexing_option.click() + + # Get all rows from the filtered table + rows = self.driver.find_elements(By.CSS_SELECTOR, "#collection_table tbody tr") + assert len(rows) > 
0, "No rows found after filtering" + + # Verify each row shows "RE-INDEXING NOT NEEDED" in Reindexing Status column + for row in rows: + reindexing_status_cell = row.find_elements(By.TAG_NAME, "td")[self.COLUMNS["REINDEXING_STATUS"]] + assert ( + reindexing_status_cell.text.lower() == "re-indexing not needed" + ), f"Expected 'RE-INDEXING NOT NEEDED' but found {reindexing_status_cell.text}" + + def tearDown(self): + """Clear all filters after each test""" + + clear_all_button = self.driver.find_element(By.CSS_SELECTOR, "button.dtsp-clearAll") + if "disabled" not in clear_all_button.get_attribute("class"): + clear_all_button.click() + super().tearDown() diff --git a/sde_collections/tests/frontend/test_pattern_application.py b/sde_collections/tests/frontend/test_pattern_application.py new file mode 100644 index 00000000..632c5e29 --- /dev/null +++ b/sde_collections/tests/frontend/test_pattern_application.py @@ -0,0 +1,200 @@ +# docker-compose -f local.yml run --rm django pytest -s sde_collections/tests/frontend/test_pattern_application.py + +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC + +from ..factories import CollectionFactory, CuratedUrlFactory, DeltaUrlFactory +from .base import BaseTestCase + + +class TestPatternApplication(BaseTestCase): + """Test different types of pattern application""" + + def setUp(self) -> None: + super().setUp() + self.user, self.password = self.create_test_user(is_staff=True) + + self.collection = CollectionFactory(curated_by=self.user) + + self.delta_urls = [ + DeltaUrlFactory(collection=self.collection, url="https://example.com/docs/page1.html"), + DeltaUrlFactory(collection=self.collection, url="https://example.com/docs/page2.html"), + ] + + self.curated_urls = [ + CuratedUrlFactory(collection=self.collection, url="https://example.com/docs/page3.html"), + CuratedUrlFactory(collection=self.collection, url="https://example.com/index.html"), + ] + + self.login(self.user.username, self.password) + self.driver.get(f"{self.live_server_url}/{self.collection.id}/delta-urls") + + def test_create_exclude_pattern(self): + """Test creating a new exclude pattern.""" + # Click Exclude Patterns tab + exclude_patterns_tab = self.wait.until(EC.element_to_be_clickable((By.ID, "excludePatternsTab"))) + exclude_patterns_tab.click() + + # Click Add Pattern button + add_pattern_button = self.wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, "button.addPattern[aria-controls='exclude_patterns_table']")) + ) + add_pattern_button.click() + + # Fill up the form using JavaScript and close modal properly + self.driver.execute_script( + """ + document.querySelector("#excludePatternModal #match_pattern_input").value = 'example.com/docs/'; + document.querySelector('#excludePatternModal .pattern_type_form_select[value="2"]').click(); + document.querySelector("#excludePatternModal button.btn-primary[type='submit']").click(); + """ + ) + + # Verify pattern details + pattern_row = self.wait.until( + EC.presence_of_element_located((By.XPATH, "//td[contains(text(), 'example.com/docs/')]")) + ) + row_text = pattern_row.find_element(By.XPATH, "..").text + + assert "example.com/docs/" in row_text + assert "Multi-URL Pattern" in row_text + assert "3" in row_text + + self.driver.get(f"{self.live_server_url}/{self.collection.id}/delta-urls") + + # Verify exclude checkmark for each delta URL + for delta_url in self.delta_urls: + row = self.driver.find_element(By.ID, delta_url.url) + check_icon = row.find_element(By.CSS_SELECTOR, 
"i[style*='color: green']") + assert check_icon.text == "check" + + def test_create_include_pattern(self): + """Test creating a new include pattern.""" + # Click Include Patterns tab + include_patterns_tab = self.wait.until(EC.element_to_be_clickable((By.ID, "includePatternsTab"))) + include_patterns_tab.click() + + # Click Add Pattern button + add_pattern_button = self.wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, "button.addPattern[aria-controls='include_patterns_table']")) + ) + add_pattern_button.click() + + # Fill up the form using JavaScript and close modal properly + self.driver.execute_script( + """ + document.querySelector("#includePatternModal #match_pattern_input").value = 'example.com/docs/'; + document.querySelector('#includePatternModal .pattern_type_form_select[value="2"]').click(); + document.querySelector("#includePatternModal button.btn-primary[type='submit']").click(); + """ + ) + + # Verify pattern details + pattern_row = self.wait.until( + EC.presence_of_element_located((By.XPATH, "//td[contains(text(), 'example.com/docs/')]")) + ) + row_text = pattern_row.find_element(By.XPATH, "..").text + + assert "example.com/docs/" in row_text + assert "Multi-URL Pattern" in row_text + assert "3" in row_text + + self.driver.get(f"{self.live_server_url}/{self.collection.id}/delta-urls") + + # Verify no exclude checkmark for each delta URL + for delta_url in self.delta_urls: + row = self.driver.find_element(By.ID, delta_url.url) + check_icon = row.find_element(By.CSS_SELECTOR, "i[style*='color: red']") + assert check_icon.text == "close" + + def test_create_title_pattern(self): + """Test creating a new title pattern.""" + # Click Title Patterns tab + title_patterns_tab = self.wait.until(EC.element_to_be_clickable((By.ID, "titlePatternsTab"))) + title_patterns_tab.click() + + # Click Add Pattern button + add_pattern_button = self.wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, "button.addPattern[aria-controls='title_patterns_table']")) + ) + add_pattern_button.click() + + # Fill up the form using JavaScript and close modal properly + self.driver.execute_script( + """ + document.querySelector("#titlePatternModal #match_pattern_input").value = 'example.com/docs/'; + document.querySelector("#titlePatternModal #title_pattern_input").value = 'Documentation: {title}'; + document.querySelector('#titlePatternModal .pattern_type_form_select[value="2"]').click(); + document.querySelector("#titlePatternModal button.btn-primary[type='submit']").click(); + """ + ) + + # Verify pattern details + pattern_row = self.wait.until( + EC.presence_of_element_located((By.XPATH, "//td[contains(text(), 'example.com/docs/')]")) + ) + row_text = pattern_row.find_element(By.XPATH, "..").text + + assert "example.com/docs/" in row_text + assert "Documentation: {title}" in row_text + assert "Multi-URL Pattern" in row_text + assert "3" in row_text + + self.driver.get(f"{self.live_server_url}/{self.collection.id}/delta-urls") + + # Wait for at least one row to be present in the table + self.wait.until( + EC.presence_of_element_located((By.CSS_SELECTOR, "#delta_urls_table tbody tr td:not(.dt-empty)")) + ) + + table_html = self.driver.find_element(By.ID, "delta_urls_table").get_attribute("outerHTML") + + # Verify that previous curated_url now appear in delta_urls page after pattern application + assert "example.com/docs/page3.html" in table_html + + # Verify each delta URL's title has been updated with the pattern + for delta_url in self.collection.delta_urls.all(): + expected_title = 
f"Documentation: {delta_url.scraped_title}" + assert expected_title in table_html, f"Expected title '{expected_title}' not found in table" + + def test_create_documenttype_pattern(self): + """Test creating a new document type pattern.""" + # Click Document Type Patterns tab + documenttype_patterns_tab = self.wait.until(EC.element_to_be_clickable((By.ID, "documentTypePatternsTab"))) + documenttype_patterns_tab.click() + + # Click Add Pattern button + add_pattern_button = self.wait.until( + EC.element_to_be_clickable( + (By.CSS_SELECTOR, "button.addPattern[aria-controls='document_type_patterns_table']") + ) + ) + add_pattern_button.click() + + # Fill up the form using JavaScript and close modal properly + self.driver.execute_script( + """ + document.querySelector("#documentTypePatternModal #match_pattern_input").value = 'example.com/docs/'; + document.querySelector('#documentTypePatternModal .document_type_form_select[value="2"]').click(); // DATA + document.querySelector('#documentTypePatternModal .pattern_type_form_select[value="2"]').click(); + document.querySelector("#documentTypePatternModal button.btn-primary[type='submit']").click(); + """ + ) + + # Verify pattern details + pattern_row = self.wait.until( + EC.presence_of_element_located((By.XPATH, "//td[contains(text(), 'example.com/docs/')]")) + ) + row_text = pattern_row.find_element(By.XPATH, "..").text + + assert "example.com/docs/" in row_text + assert "Multi-URL Pattern" in row_text + assert "3" in row_text + + self.driver.get(f"{self.live_server_url}/{self.collection.id}/delta-urls") + + # Verify document type is set to Data + for delta_url in self.delta_urls: + row = self.driver.find_element(By.ID, delta_url.url) + doc_type_button = row.find_element(By.CSS_SELECTOR, "button.btn-success") + assert doc_type_button.text == "DATA" diff --git a/sde_collections/tests/test_database_backup.py b/sde_collections/tests/test_database_backup.py new file mode 100644 index 00000000..680da2f1 --- /dev/null +++ b/sde_collections/tests/test_database_backup.py @@ -0,0 +1,177 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_database_backup.py +import gzip +import os +import subprocess +from datetime import datetime +from unittest.mock import Mock, patch + +import pytest +from django.core.management import call_command + +from sde_collections.management.commands import database_backup +from sde_collections.management.commands.database_backup import temp_file_handler + + +@pytest.fixture +def mock_subprocess(): + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + yield mock_run + + +@pytest.fixture +def mock_date(): + with patch("sde_collections.management.commands.database_backup.datetime") as mock_dt: + mock_dt.now.return_value = datetime(2024, 1, 15) + yield mock_dt + + +@pytest.fixture +def mock_settings(settings): + """Configure test database settings.""" + settings.DATABASES = { + "default": { + "HOST": "test-db-host", + "NAME": "test_db", + "USER": "test_user", + "PASSWORD": "test_password", + } + } + return settings + + +@pytest.fixture +def command(): + return database_backup.Command() + + +class TestBackupCommand: + def test_get_backup_filename_compressed(self, command, mock_date, monkeypatch): + """Test backup filename generation with compression.""" + monkeypatch.setenv("BACKUP_ENVIRONMENT", "staging") + backup_file, dump_file = command.get_backup_filename(compress=True) + assert backup_file.endswith("staging_backup_20240115.sql.gz") + assert 
dump_file.endswith("staging_backup_20240115.sql") + + def test_get_backup_filename_uncompressed(self, command, mock_date, monkeypatch): + """Test backup filename generation without compression.""" + monkeypatch.setenv("BACKUP_ENVIRONMENT", "production") + backup_file, dump_file = command.get_backup_filename(compress=False) + assert backup_file.endswith("production_backup_20240115.sql") + assert dump_file == backup_file + + def test_get_backup_filename_no_environment(self, command, mock_date, monkeypatch): + """Test backup filename generation with no environment set.""" + monkeypatch.delenv("BACKUP_ENVIRONMENT", raising=False) + backup_file, dump_file = command.get_backup_filename(compress=True) + assert backup_file.endswith("unknown_backup_20240115.sql.gz") + assert dump_file.endswith("unknown_backup_20240115.sql") + + def test_run_pg_dump(self, command, mock_subprocess, mock_settings): + """Test pg_dump command execution.""" + env = {"PGPASSWORD": "test_password"} + command.run_pg_dump("test_output.sql", env) + + mock_subprocess.assert_called_once() + cmd_args = mock_subprocess.call_args[0][0] + assert cmd_args == [ + "pg_dump", + "-h", + "test-db-host", + "-U", + "test_user", + "-d", + "test_db", + "--no-owner", + "--no-privileges", + "-f", + "test_output.sql", + ] + + def test_compress_file(self, command, tmp_path): + """Test file compression.""" + input_file = tmp_path / "test.sql" + output_file = tmp_path / "test.sql.gz" + test_content = b"Test database content" + + # Create test input file + input_file.write_bytes(test_content) + + # Compress the file + command.compress_file(str(input_file), str(output_file)) + + # Verify compression + assert output_file.exists() + with gzip.open(output_file, "rb") as f: + assert f.read() == test_content + + def test_temp_file_handler_cleanup(self, tmp_path): + """Test temporary file cleanup.""" + test_file = tmp_path / "temp.sql" + test_file.touch() + + with temp_file_handler(str(test_file)): + assert test_file.exists() + assert not test_file.exists() + + def test_temp_file_handler_cleanup_on_error(self, tmp_path): + """Test temporary file cleanup when an error occurs.""" + test_file = tmp_path / "temp.sql" + test_file.touch() + + with pytest.raises(ValueError): + with temp_file_handler(str(test_file)): + assert test_file.exists() + raise ValueError("Test error") + assert not test_file.exists() + + @pytest.mark.parametrize( + "compress,env_name", + [ + (True, "production"), + (False, "staging"), + (True, "carson_local"), + ], + ) + def test_handle_integration(self, compress, env_name, mock_subprocess, mock_date, mock_settings, monkeypatch): + """Test full backup process integration.""" + monkeypatch.setenv("BACKUP_ENVIRONMENT", env_name) + call_command("database_backup", no_compress=not compress) + + # Verify correct command execution + mock_subprocess.assert_called_once() + + # Verify correct filename used + cmd_args = mock_subprocess.call_args[0][0] + date_str = "20240115" + expected_base = f"{env_name}_backup_{date_str}.sql" + assert cmd_args[-1].endswith(expected_base) + + # Verify cleanup attempted if compressed + if compress: + assert not os.path.exists(expected_base) + + def test_handle_pg_dump_error(self, mock_subprocess, mock_date, monkeypatch): + """Test error handling when pg_dump fails.""" + mock_subprocess.side_effect = subprocess.CalledProcessError(1, "pg_dump") + monkeypatch.setenv("BACKUP_ENVIRONMENT", "staging") + + call_command("database_backup") + + # Verify error handling and cleanup + date_str = "20240115" + temp_file = 
f"staging_backup_{date_str}.sql" + assert not os.path.exists(temp_file) + + def test_handle_compression_error(self, mock_subprocess, mock_date, command, monkeypatch): + """Test error handling during compression.""" + monkeypatch.setenv("BACKUP_ENVIRONMENT", "staging") + # Mock compression to fail + command.compress_file = Mock(side_effect=Exception("Compression failed")) + + call_command("database_backup") + + # Verify cleanup + date_str = "20240115" + temp_file = f"staging_backup_{date_str}.sql" + assert not os.path.exists(temp_file) diff --git a/sde_collections/tests/test_database_restore.py b/sde_collections/tests/test_database_restore.py new file mode 100644 index 00000000..21088ad0 --- /dev/null +++ b/sde_collections/tests/test_database_restore.py @@ -0,0 +1,269 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_database_restore.py +import gzip +from unittest.mock import patch + +import pytest +from django.core.management import call_command +from django.core.management.base import CommandError +from django.db import connections + +from sde_collections.management.commands import database_restore +from sde_collections.models.collection import Collection +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + +# Register the integration mark +pytest.mark.integration = pytest.mark.django_db(transaction=True) + + +@pytest.fixture +def mock_subprocess(): + with patch("subprocess.run") as mock_run: + mock_run.return_value.returncode = 0 + yield mock_run + + +@pytest.fixture +def mock_settings(settings): + """Configure test database settings.""" + settings.DATABASES = { + "default": { + "HOST": "test-db-host", + "NAME": "test_db", + "USER": "test_user", + "PASSWORD": "test_password", + } + } + return settings + + +@pytest.fixture +def command(): + return database_restore.Command() + + +@pytest.fixture +def backup_file(tmp_path): + """Create a temporary backup file.""" + backup_path = tmp_path / "test_backup.sql" + backup_path.write_text("-- Test backup content") + return str(backup_path) + + +@pytest.fixture +def compressed_backup_file(tmp_path): + """Create a temporary compressed backup file.""" + backup_path = tmp_path / "test_backup.sql.gz" + with gzip.open(backup_path, "wt") as f: + f.write("-- Test backup content") + return str(backup_path) + + +class TestRestoreCommand: + def test_get_db_settings(self, command, mock_settings): + """Test database settings retrieval.""" + settings = command.get_db_settings() + assert settings == { + "host": "test-db-host", + "name": "test_db", + "user": "test_user", + "password": "test_password", + } + + def test_run_psql_command(self, command, mock_subprocess, mock_settings): + """Test psql command execution.""" + env = {"PGPASSWORD": "test_password"} + command.run_psql_command("SELECT 1;", "test_db", env) + + mock_subprocess.assert_called_once() + cmd_args = mock_subprocess.call_args[0][0] + assert cmd_args == [ + "psql", + "-h", + "test-db-host", + "-U", + "test_user", + "-d", + "test_db", + "-c", + "SELECT 1;", + ] + + def test_reset_database(self, command, mock_subprocess, mock_settings): + """Test database reset process.""" + env = {"PGPASSWORD": "test_password"} + command.reset_database(env) + + # Verify drop, create and terminate connections commands were executed + assert mock_subprocess.call_count >= 2 + calls = mock_subprocess.call_args_list + assert any("DROP 
DATABASE" in call[0][0][-1] for call in calls) + assert any("CREATE DATABASE" in call[0][0][-1] for call in calls) + + def test_restore_backup(self, command, mock_subprocess, mock_settings, backup_file): + """Test backup restoration.""" + env = {"PGPASSWORD": "test_password"} + command.restore_backup(backup_file, env) + + mock_subprocess.assert_called_once() + cmd_args = mock_subprocess.call_args[0][0] + assert cmd_args == [ + "psql", + "-h", + "test-db-host", + "-U", + "test_user", + "-d", + "test_db", + "-f", + backup_file, + ] + + def test_decompress_file(self, command, tmp_path, compressed_backup_file): + """Test backup file decompression.""" + output_file = str(tmp_path / "decompressed.sql") + command.decompress_file(compressed_backup_file, output_file) + + with open(output_file) as f: + content = f.read() + assert content == "-- Test backup content" + + def test_handle_file_not_found(self, command): + """Test error handling for non-existent backup file.""" + with pytest.raises(CommandError): + call_command("database_restore", "nonexistent.sql") + + +@pytest.mark.django_db +class TestDatabaseIntegration: + """Integration tests for backup and restore functionality.""" + + def create_test_data(self): + """Create a set of test data using factories.""" + collection = CollectionFactory() + + # Create some URLs + dump_urls = DumpUrlFactory.create_batch(3, collection=collection) + curated_urls = CuratedUrlFactory.create_batch(3, collection=collection) + delta_urls = DeltaUrlFactory.create_batch(3, collection=collection) + + return { + "collection": collection, + "dump_urls": dump_urls, + "curated_urls": curated_urls, + "delta_urls": delta_urls, + } + + def verify_data_integrity(self, original_data): + """Verify that all data matches the original after restore.""" + # Close all existing database connections before verification + connections.close_all() + + # Verify collection + restored_collection = Collection.objects.get(pk=original_data["collection"].pk) + assert restored_collection.name == original_data["collection"].name + assert restored_collection.config_folder == original_data["collection"].config_folder + + # Verify URLs + for original_url in original_data["dump_urls"]: + restored_url = DumpUrl.objects.get(pk=original_url.pk) + assert restored_url.url == original_url.url + assert restored_url.scraped_title == original_url.scraped_title + + for original_url in original_data["curated_urls"]: + restored_url = CuratedUrl.objects.get(pk=original_url.pk) + assert restored_url.url == original_url.url + assert restored_url.scraped_title == original_url.scraped_title + + for original_url in original_data["delta_urls"]: + restored_url = DeltaUrl.objects.get(pk=original_url.pk) + assert restored_url.url == original_url.url + assert restored_url.scraped_title == original_url.scraped_title + + @pytest.mark.integration + def test_full_backup_restore_cycle(self, tmp_path): + """Test complete backup and restore cycle with actual data.""" + # Create test data + original_data = self.create_test_data() + + # Create backup + backup_file = str(tmp_path / "integration_test_backup.sql") + with patch("socket.gethostname", return_value="TEST-SERVER"): + connections.close_all() # Close connections before backup + call_command("database_backup", "--no-compress", output=backup_file) + + # Clear the database + for Model in [Collection, DumpUrl, CuratedUrl, DeltaUrl]: + Model.objects.all().delete() + + assert Collection.objects.count() == 0 + assert DumpUrl.objects.count() == 0 + assert CuratedUrl.objects.count() 
== 0 + assert DeltaUrl.objects.count() == 0 + + # Restore from backup + connections.close_all() # Close connections before restore + call_command("database_restore", backup_file) + + # Verify data integrity + self.verify_data_integrity(original_data) + + @pytest.mark.integration + def test_compressed_backup_restore_cycle(self, tmp_path): + """Test backup and restore cycle with compression.""" + # Create test data + original_data = self.create_test_data() + + # Create compressed backup + backup_file = str(tmp_path / "integration_test_backup.sql.gz") + with patch("socket.gethostname", return_value="TEST-SERVER"): + connections.close_all() # Close connections before backup + call_command("database_backup", output=backup_file) # Compression is enabled by default + + # Clear the database + connections.close_all() # Close connections before clearing + Collection.objects.all().delete() + + # Restore from compressed backup + connections.close_all() # Close connections before restore + call_command("database_restore", backup_file) + + # Verify data integrity + self.verify_data_integrity(original_data) + + @pytest.mark.integration + def test_partial_data_integrity(self, tmp_path): + """Test backup and restore with partial data modifications.""" + # Create initial data + original_data = self.create_test_data() + original_name = original_data["collection"].name + original_url_id = original_data["curated_urls"][0].id # Store the ID explicitly + + # Create backup + backup_file = str(tmp_path / "partial_test_backup.sql") + with patch("socket.gethostname", return_value="TEST-SERVER"): + connections.close_all() # Close connections before backup + call_command("database_backup", "--no-compress", output=backup_file) + + # Modify some data + collection = original_data["collection"] + collection.name = "Modified Name" + collection.save() + + new_curated_url = CuratedUrlFactory(collection=collection) + original_data["curated_urls"][0].delete() + + # Restore from backup + connections.close_all() # Close connections before restore + call_command("database_restore", backup_file) + + # Verify original state is restored + restored_collection = Collection.objects.get(pk=collection.pk) + assert restored_collection.name == original_name + assert not CuratedUrl.objects.filter(pk=new_curated_url.pk).exists() + assert CuratedUrl.objects.filter(pk=original_url_id).exists() # Use the stored ID diff --git a/sde_collections/tests/test_delta_patterns.py b/sde_collections/tests/test_delta_patterns.py new file mode 100644 index 00000000..a7941fbd --- /dev/null +++ b/sde_collections/tests/test_delta_patterns.py @@ -0,0 +1,313 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_delta_patterns.py + +import pytest + +from sde_collections.models.delta_patterns import ( + DeltaExcludePattern, + DeltaResolvedTitleError, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, +) +from sde_collections.utils.title_resolver import resolve_title + + +@pytest.mark.django_db +def test_exclusion_status(): + """ + new patterns should only exclude DeltaUrls, not CuratedUrls + """ + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page/1") + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page/2") + + # confirm they both start as not excluded + assert 
DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + # Create an exclusion pattern matches both urls + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern="*page*", match_pattern_type=2) + pattern.apply() + + # curated urls should not be affected by patterns until the collection is promoted + # curated should be included, but delta should be excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + +@pytest.mark.django_db +class TestBaseMatchPattern: + def test_pattern_save_applies_effects(self): + """Test that pattern creation automatically applies effects.""" + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/test") + + # Create pattern - should automatically apply + pattern = DeltaExcludePattern.objects.create( + collection=collection, match_pattern=curated_url.url, match_pattern_type=1 + ) + + # Delta URL should be created and excluded + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.excluded is True + assert pattern.delta_urls.filter(id=delta_url.id).exists() + + def test_pattern_delete_removes_effects(self): + """Test that deleting a pattern properly removes its effects.""" + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/test") + + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern=curated_url.url) + + # Verify initial state + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.excluded is True + + # Delete pattern + pattern.delete() + + # Delta URL should be gone since it was only created for exclusion + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + def test_different_collections_isolation(self): + """Test that patterns only affect URLs in their collection.""" + collection1 = CollectionFactory() + collection2 = CollectionFactory() + + # Create URLs with different paths + curated_url1 = CuratedUrlFactory(collection=collection1, url="https://example.com/test1") + curated_url2 = CuratedUrlFactory(collection=collection2, url="https://example.com/test2") + + DeltaExcludePattern.objects.create( + collection=collection1, match_pattern="https://example.com/*", match_pattern_type=2 + ) + + # Only collection1's URL should be affected + assert DeltaUrl.objects.filter(collection=collection1, url=curated_url1.url).exists() + assert not DeltaUrl.objects.filter(collection=collection2, url=curated_url2.url).exists() + + +@pytest.mark.django_db +class TestDeltaTitlePattern: + + def test_apply_generates_delta_url_if_title_differs(self): + collection = CollectionFactory() + # Step 1: Create a `CuratedUrl` with a `generated_title` that should differ from the new pattern + curated_url = CuratedUrlFactory( + collection=collection, + url="https://example.com/page", + scraped_title="Sample Title", + ) + + # Step 2: Create a `DeltaTitlePattern` with a new title pattern + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, # MULTI_URL_PATTERN + title_pattern="{title} - Processed New", + ) + + # Step 3: A new DeltaUrl should be created with the updated `generated_title` + delta_url = DeltaUrl.objects.get(url=curated_url.url) + expected_generated_title = resolve_title( + pattern.title_pattern, + {"title": 
curated_url.scraped_title, "url": curated_url.url, "collection": collection.name}, + ) + assert delta_url.generated_title == expected_generated_title + + def test_apply_does_not_generate_delta_url_if_titles_match(self): + collection = CollectionFactory() + title_pattern = "{title} - Processed" + context = { + "url": "https://example.com/page", + "title": "Sample Title", + "collection": collection.name, + } + curated_url = CuratedUrlFactory( + collection=collection, + url=context["url"], + scraped_title=context["title"], + generated_title=resolve_title(title_pattern, context), + ) + + # Create and apply a `DeltaTitlePattern` with the same title pattern + DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern=title_pattern, + ) + # pattern.apply() + + # Since the title matches, no new `DeltaUrl` should be created + DeltaUrl.objects.filter(url=curated_url.url).first() + + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + def test_apply_resolves_title_for_delta_urls(self): + collection = CollectionFactory() + # Create a `DeltaUrl` that will be matched and have the title pattern applied + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Sample Title") + + # Create and apply a `DeltaTitlePattern` to apply a generated title + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern="{title} - Processed", + ) + pattern.apply() + + # The `generated_title` in `DeltaUrl` should reflect the applied pattern + delta_url.refresh_from_db() + expected_generated_title = resolve_title(pattern.title_pattern, {"title": delta_url.scraped_title}) + assert delta_url.generated_title == expected_generated_title + + def test_apply_logs_error_on_title_resolution_failure(self): + # TODO: note that if you apply a pattern with an error multiple times + # it will not log multiple errors on a url. 
it will instead throw a duplicate key error + # at some point, the error code should be made more robust to handle this + collection = CollectionFactory() + # Create a `DeltaUrl` that will trigger a resolution error + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Sample Title") + + # Create a `DeltaTitlePattern` with an invalid title pattern to trigger an error + DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern="{invalid_field} - Processed", + ) + + # Check that a `DeltaResolvedTitleError` was logged + error_entry = DeltaResolvedTitleError.objects.get(delta_url__url=delta_url.url) + assert "invalid_field" in error_entry.error_string + + def test_unapply_clears_generated_titles_from_delta_urls(self): + collection = CollectionFactory() + # Create a `DeltaUrl` with an existing `scraped_title` + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Sample Title") + + # Create and apply a `DeltaTitlePattern` + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern="{title} - Processed", + ) + delta_url.refresh_from_db() + assert delta_url.generated_title == "Sample Title - Processed" + + # Unapply the pattern, which should clear the `generated_title` in `DeltaUrl` + pattern.delete() + delta_url.refresh_from_db() + assert delta_url.generated_title == "" + + def test_unapply_removes_pattern_relationships(self): + collection = CollectionFactory() + # Create a `CuratedUrl` and matching `DeltaUrl` + curated_url = CuratedUrlFactory( + collection=collection, url="https://example.com/page", scraped_title="Sample Title" + ) + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="New Title") + + # Create and apply a `DeltaTitlePattern` + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern="{title} - Processed", + ) + pattern.apply() + pattern.refresh_from_db() + + # Ensure relationships are set + assert pattern.delta_urls.filter(pk=delta_url.pk).exists() + # this actually shouldn't match until after promotion + assert not pattern.curated_urls.filter(pk=curated_url.pk).exists() + + # Unapply the pattern + pattern.unapply() + + # Verify relationships have been cleared + assert not pattern.delta_urls.filter(pk=delta_url.pk).exists() + assert not pattern.curated_urls.filter(pk=curated_url.pk).exists() + + def test_pattern_reapplication_does_not_duplicate_delta_urls(self): + """ + Ensures that reapplying a pattern does not create duplicate `DeltaUrls` or affect existing `CuratedUrls`. 
+ """ + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Title Before") + + # Apply a pattern + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="https://example.com/*", + match_pattern_type=2, + title_pattern="{title} - Processed", + ) + + delta_url.refresh_from_db() + delta_url.generated_title = "Title Before - Processed" + + # Promote to CuratedUrl + collection.promote_to_curated() + curated_url = CuratedUrl.objects.get(url=delta_url.url) + + # Ensure no new `DeltaUrl` is created after reapplying the pattern + pattern.apply() + assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0 + + # Ensure no new `DeltaUrl` is created after reapplying the pattern + pattern.apply() + assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0 + + @pytest.mark.django_db + def test_title_pattern_error_updates(self): + """ + Test that when a more specific pattern creates an error, + it updates rather than duplicates the error record. + """ + # Create a collection and URL + collection = CollectionFactory() + url = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item.html", scraped_title="Original Title" + ) + + # Create a general pattern first + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", + title_pattern="{invalid}", # Invalid variable name will cause error + match_pattern_type=2, + ) + + # Verify initial error state + error = url.deltaresolvedtitleerror + assert error.title_pattern == general_pattern + assert "Variable 'invalid' not allowed in f-string pattern" in error.error_string + + # Create a more specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific*", + title_pattern="{another_invalid}", + match_pattern_type=2, + ) + + # Re-fetch error to see latest state + error.refresh_from_db() + + # Error should now be from specific pattern + assert ( + error.title_pattern == specific_pattern + ), f"Error still associated with {error.title_pattern} instead of {specific_pattern}" + assert "Variable 'another_invalid' not allowed in f-string pattern" in error.error_string + + # Verify we still only have one error record + assert DeltaResolvedTitleError.objects.filter(delta_url=url).count() == 1 diff --git a/sde_collections/tests/test_exclude_patterns.py b/sde_collections/tests/test_exclude_patterns.py new file mode 100644 index 00000000..3bf474d2 --- /dev/null +++ b/sde_collections/tests/test_exclude_patterns.py @@ -0,0 +1,366 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_exclude_patterns.py + +import pytest +from django.contrib.contenttypes.models import ContentType +from django.db import IntegrityError +from django.test import TestCase + +from sde_collections.models.delta_patterns import DeltaExcludePattern +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + + +class BaseCollectionTest(TestCase): + def setUp(self): + super().setUp() + self.collection = CollectionFactory() + + # Ensure ContentTypes are created for all pattern models + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltaexcludepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltaincludepattern", + ) + 
ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltatitlepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltadocumenttypepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltadivisionpattern", + ) + + +@pytest.mark.django_db +class TestDeltaExcludePatternBasics(TestCase): + """Test basic functionality of exclude patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_create_simple_exclude_pattern(self): + """Test creation of a basic exclude pattern.""" + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="https://example.com/exclude-me", reason="Test exclusion" + ) + assert pattern.match_pattern_type == DeltaExcludePattern.MatchPatternTypeChoices.INDIVIDUAL_URL + + def test_exclude_single_curated_url(self): + """Test excluding a single curated URL creates appropriate delta.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/exclude-me", scraped_title="Test Title" + ) + + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Pattern should create a delta URL + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert pattern.delta_urls.filter(id=delta_url.id).exists() + assert not pattern.curated_urls.filter(id=curated_url.id).exists() + + def test_exclude_single_curated_url_multiple_applies(self): + """ + Test excluding a single curated URL creates appropriate delta. + even if the pattern is applied multiple times + """ + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/exclude-me", scraped_title="Test Title" + ) + + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + pattern.save() + pattern.apply() + pattern.apply() + pattern.save() + + # Pattern should create a delta URL + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert pattern.delta_urls.filter(id=delta_url.id).exists() + assert not pattern.curated_urls.filter(id=curated_url.id).exists() + + def test_wildcard_pattern_exclusion(self): + """Test excluding multiple URLs with wildcard pattern.""" + # Create multiple curated URLs + urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/docs/internal/{i}", + scraped_title=f"Internal Doc {i}", + ) + for i in range(3) + ] + + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/internal/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # All URLs should have corresponding deltas + assert DeltaUrl.objects.filter(collection=self.collection).count() == 3 + for url in urls: + assert pattern.delta_urls.filter(url=url.url).exists() + assert not pattern.curated_urls.filter(id=url.id).exists() + + def test_exclusion_selectivity(self): + """ + new patterns should only exclude DeltaUrls, not CuratedUrls + """ + curated_url = CuratedUrlFactory(collection=self.collection, url="https://example.com/page/1") + delta_url = DeltaUrlFactory(collection=self.collection, url="https://example.com/page/2") + + # confirm they both start as not excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + # Create an exclusion pattern matches 
both urls + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="*page*", match_pattern_type=2 + ) + pattern.apply() + + # curated urls should not be affected by patterns until the collection is promoted + # curated should be included, but delta should be excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + +class TestDeltaExcludePatternWorkflow(BaseCollectionTest): + """Test complex workflows involving exclude patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_removal_creates_reversal_deltas(self): + """ + Test that removing an exclude pattern after promotion creates delta URLs + to reverse the exclusion of previously excluded curated URLs. + """ + collection = self.collection + # Create curated URL + curated_url = CuratedUrlFactory( + collection=collection, url="https://example.com/test", scraped_title="Test Title" + ) + + # Create exclude pattern - this should create excluded delta URL + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern=curated_url.url) + + # Verify delta URL was created and is excluded + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.excluded is True + + # Promote collection - this should convert excluded delta URL to excluded curated URL + collection.promote_to_curated() + + # Verify curated URL is now excluded and delta URL is gone + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + curated_url = CuratedUrl.objects.get(url=curated_url.url) + assert curated_url.excluded is True + + # Remove pattern - this should create new delta URL to show URL will be included + pattern.delete() + + reversal_delta = DeltaUrl.objects.get(url=curated_url.url) + assert reversal_delta.excluded is False + + collection.promote_to_curated() + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + curated_url = CuratedUrl.objects.get(url=curated_url.url) + assert curated_url.excluded is False + + def test_promote_and_new_exclude_workflow(self): + """Test workflow: add URLs, exclude some, promote, then add new exclude pattern.""" + # Initial setup with curated URLs + [ + CuratedUrlFactory(collection=self.collection, url=f"https://example.com/page{i}", scraped_title=f"Page {i}") + for i in range(3) + ] + + # Create first exclude pattern + DeltaExcludePattern.objects.create(collection=self.collection, match_pattern="https://example.com/page1") + + # Verify delta URL created + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + + # Simulate promotion + self.collection.promote_to_curated() + + # Create new exclude pattern after promotion + pattern2 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="https://example.com/page2" + ) + + # Should have new delta URL for newly excluded URL + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + assert pattern2.delta_urls.count() == 1 + + def test_dump_migration_with_excludes(self): + """Test handling of excluded URLs during dump migration.""" + # Create initial curated URLs + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Original Title" + ) + + # Create exclude pattern, this should not effect the curated + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Create dump URL with different content, 
same as curated, different title, will make delta + DumpUrlFactory(collection=self.collection, url=curated_url.url, scraped_title="Updated Title") + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Should have delta URL reflecting both exclusion and content change + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.scraped_title == "Updated Title" + assert pattern.delta_urls.filter(id=delta_url.id).exists() + assert delta_url.excluded is True + + +class TestDeltaExcludePatternEdgeCases(TestCase): + """Test edge cases and complex scenarios.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_exclude_pattern_uniqueness(self): + """Test that we cannot create duplicate exclude patterns for the same URL in a collection.""" + from django.db import transaction + + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Test Title" + ) + + # Create first exclude pattern + pattern1 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="First exclusion" + ) + + # Verify we start with one pattern + assert DeltaExcludePattern.objects.filter(collection=self.collection).count() == 1 + + # Attempt to create second exclude pattern with same match_pattern should fail + with pytest.raises(IntegrityError), transaction.atomic(): + DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="Second exclusion" + ) + + # Verify we still only have one pattern + assert DeltaExcludePattern.objects.filter(collection=self.collection).count() == 1 + + # Verify only one delta URL exists and is associated with the pattern + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert pattern1.delta_urls.filter(id=delta_url.id).exists() + + def test_different_patterns_matching_same_url(self): + """Test that different patterns can affect the same URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test/page", scraped_title="Test Title" + ) + + # Create pattern matching exact URL + pattern1 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="Exact match exclusion" + ) + + # Create pattern with wildcard that also matches the URL + pattern2 = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/test/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + reason="Wildcard exclusion", + ) + + # Should still only have one delta URL + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + delta_url = DeltaUrl.objects.get(url=curated_url.url) + + # URL should be associated with both patterns + assert pattern1.delta_urls.filter(id=delta_url.id).exists() + assert pattern2.delta_urls.filter(id=delta_url.id).exists() + + def test_exclude_modified_url(self): + """Test excluding a URL that already has modifications in delta doesn't lose delta mods""" + # Create curated URL + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Original Title" + ) + + # Create modified delta URL + DeltaUrlFactory(collection=self.collection, url=curated_url.url, scraped_title="Modified Title") + + # Create exclude pattern + pattern = 
DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Should still only have one delta URL with both modification and exclusion + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + updated_delta = DeltaUrl.objects.get(url=curated_url.url) + assert updated_delta.scraped_title == "Modified Title" + assert pattern.delta_urls.filter(id=updated_delta.id).exists() + + def test_pattern_update_workflow(self): + """ + Test updating an exclude pattern's criteria properly updates URL associations + while preserving existing delta changes. + """ + # Create multiple curated URLs + urls = [ + CuratedUrlFactory( + collection=self.collection, url=f"https://example.com/section{i}/page", scraped_title=f"Page {i}" + ) + for i in range(3) + ] + + # Create a delta URL for section1 with a modified title + DeltaUrlFactory( + collection=self.collection, url=urls[1].url, scraped_title="Modified Title for Section 1" # section1 + ) + + # Create initial pattern matching section1/* + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/section1/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Verify initial state + assert not pattern.delta_urls.filter(url=urls[0].url).exists() # section0 + assert pattern.delta_urls.filter(url=urls[1].url).exists() # section1 + assert not pattern.delta_urls.filter(url=urls[2].url).exists() # section2 + + # Verify the delta URL still exists and has its modified title + assert DeltaUrl.objects.filter(url=urls[1].url).exists() + assert DeltaUrl.objects.get(url=urls[1].url).scraped_title == "Modified Title for Section 1" + + # Update pattern to match section2/* instead + pattern.match_pattern = "https://example.com/section2/*" + pattern.save() + + # Verify pattern associations have updated correctly + assert not pattern.delta_urls.filter(url=urls[0].url).exists() # section0 + assert not pattern.delta_urls.filter(url=urls[1].url).exists() # section1 + assert pattern.delta_urls.filter(url=urls[2].url).exists() # section2 + + # Verify section1's delta URL still exists with its modified title + assert DeltaUrl.objects.filter(url=urls[1].url).exists() + delta_after_update = DeltaUrl.objects.get(url=urls[1].url) + assert delta_after_update.scraped_title == "Modified Title for Section 1" diff --git a/sde_collections/tests/test_field_modifier_patterns.py b/sde_collections/tests/test_field_modifier_patterns.py new file mode 100644 index 00000000..db15a21e --- /dev/null +++ b/sde_collections/tests/test_field_modifier_patterns.py @@ -0,0 +1,490 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_field_modifier_patterns.py + +import pytest +from django.contrib.contenttypes.models import ContentType +from django.db import IntegrityError +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import CollectionFactory, CuratedUrlFactory, DeltaUrlFactory + + +class BaseCollectionTest(TestCase): + def setUp(self): + super().setUp() + self.collection = CollectionFactory() + + # Ensure ContentTypes are created for all pattern models + for model in [ + "deltaexcludepattern", + "deltaincludepattern", + "deltatitlepattern", + 
"deltadocumenttypepattern", + "deltadivisionpattern", + ]: + ContentType.objects.get_or_create( + app_label="sde_collections", + model=model, + ) + + +@pytest.mark.django_db +class TestFieldModifierPatternBasics(TestCase): + """Test basic functionality of field modifier patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_create_document_type_pattern_single(self): + """Test creation of a document type pattern for single URL.""" + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/guide.pdf", + document_type=DocumentTypes.DOCUMENTATION, + ) + assert pattern.match_pattern_type == DeltaDocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL + assert pattern.document_type == DocumentTypes.DOCUMENTATION + + def test_create_document_type_pattern_multi(self): + """Test creation of a document type pattern with wildcard.""" + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.pdf", + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + document_type=DocumentTypes.DOCUMENTATION, + ) + assert pattern.match_pattern_type == DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN + assert pattern.document_type == DocumentTypes.DOCUMENTATION + + def test_create_division_pattern(self): + """Test creation of a division pattern.""" + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/helio/data.html", + division=Divisions.HELIOPHYSICS, + ) + assert pattern.match_pattern_type == DeltaDivisionPattern.MatchPatternTypeChoices.INDIVIDUAL_URL + assert pattern.division == Divisions.HELIOPHYSICS + + def test_modify_single_curated_url_document_type(self): + """Test modifying document type for a single curated URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/tools/analysis.html", document_type=DocumentTypes.DATA + ) + + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.SOFTWARETOOLS + ) + + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.document_type == DocumentTypes.SOFTWARETOOLS + assert pattern.delta_urls.filter(id=delta_url.id).exists() + # curated url should be unchanged + assert CuratedUrl.objects.get(url=curated_url.url).document_type == DocumentTypes.DATA + + def test_modify_single_curated_url_division(self): + """Test modifying division for a single curated URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/planetary/mars.html", division=Divisions.EARTH_SCIENCE + ) + + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.PLANETARY + ) + + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.division == Divisions.PLANETARY + assert pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternBehavior(TestCase): + """Test complex behaviors of field modifier patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_with_existing_delta(self): + """Test applying pattern when delta URL already exists.""" + curated_url = CuratedUrlFactory( + collection=self.collection, + 
url="https://example.com/instruments/telescope.html", + document_type=DocumentTypes.DOCUMENTATION, + ) + + # Create delta URL with different title + delta_url = DeltaUrlFactory( + collection=self.collection, + url=curated_url.url, + scraped_title="Updated Telescope Info", + document_type=DocumentTypes.DOCUMENTATION, + ) + + # Apply pattern - should modify existing delta + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.MISSIONSINSTRUMENTS + ) + + # Should still be only one delta URL with both changes + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + updated_delta = DeltaUrl.objects.get(url=curated_url.url) + assert updated_delta.id == delta_url.id + assert updated_delta.document_type == DocumentTypes.MISSIONSINSTRUMENTS + assert updated_delta.scraped_title == "Updated Telescope Info" + assert CuratedUrl.objects.get(url=curated_url.url).document_type == DocumentTypes.DOCUMENTATION + + def test_multi_url_pattern_modification(self): + """Test modifying multiple URLs with wildcard pattern.""" + # Create multiple curated URLs + [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/images/galaxy{i}.jpg", + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(3) + ] + + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/images/*.jpg", + document_type=DocumentTypes.IMAGES, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + assert DeltaUrl.objects.filter(collection=self.collection).count() == 3 + for delta_url in DeltaUrl.objects.all(): + assert delta_url.document_type == DocumentTypes.IMAGES + assert pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternLifecycle(TestCase): + """Test pattern lifecycle including promotion and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_removal_creates_reversal_deltas(self): + """Test that removing a pattern creates deltas to reverse its effects.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/bio/experiment.html", division=Divisions.GENERAL + ) + + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.division == Divisions.BIOLOGY + + self.collection.promote_to_curated() + + curated_url = CuratedUrl.objects.get(url=curated_url.url) + + assert curated_url.division == Divisions.BIOLOGY + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + pattern.delete() + + # when all you have in the system is a curated url and a pattern setting a value + # removal of the pattern should make a delta that sets the value to None + reversal_delta = DeltaUrl.objects.get(url=curated_url.url) + assert reversal_delta.division is None + + def test_multiple_patterns_same_url(self): + """Test that different types of patterns can affect same URL.""" + url = "https://example.com/astro/telescope_data.fits" + + CuratedUrlFactory( + collection=self.collection, url=url, division=Divisions.GENERAL, document_type=DocumentTypes.DOCUMENTATION + ) + + # Apply both division and document type patterns + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=url, 
division=Divisions.ASTROPHYSICS + ) + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DATA + ) + + # Should have one delta URL reflecting both changes + assert DeltaUrl.objects.count() == 1 + delta_url = DeltaUrl.objects.get() + assert delta_url.division == Divisions.ASTROPHYSICS + assert delta_url.document_type == DocumentTypes.DATA + assert division_pattern.delta_urls.filter(id=delta_url.id).exists() + assert doc_type_pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternConstraints(TestCase): + """Test pattern constraints and validation.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_uniqueness_per_collection(self): + """Test that patterns must be unique per collection.""" + url = "https://example.com/data/sample.fits" + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DATA + ) + + with pytest.raises(IntegrityError): + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DOCUMENTATION + ) + + +@pytest.mark.django_db +class TestFieldModifierDeltaCleanup(TestCase): + """ + Test complex delta URL cleanup scenarios, particularly around pattern removal + and interaction between multiple patterns. + """ + + def setUp(self): + self.collection = CollectionFactory() + + def test_delta_retained_with_other_changes(self): + """ + Test that a delta URL with changes from multiple patterns is properly + handled when one pattern is removed. + """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + scraped_title="Original Title", # Adding this to test preservation of manual changes + ) + + # Create two patterns affecting the same URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.ASTROPHYSICS + ) + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Manually modify the title to simulate a non-pattern change + delta_url = DeltaUrl.objects.get(url=curated_url.url) + delta_url.scraped_title = "Modified Title" + delta_url.save() + + # Remove one pattern - delta should be retained with other changes + division_pattern.delete() + + # Delta should still exist with doc type change and manual title change + retained_delta = DeltaUrl.objects.get(url=curated_url.url) + assert retained_delta.document_type == DocumentTypes.DATA + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.division == Divisions.GENERAL # Division reverted to curated value + + def test_delta_cleanup_after_all_patterns_removed(self): + """ + Test cleanup of delta URLs when all patterns affecting them are removed, + but only if no other changes exist. 
+ """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + ) + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Verify delta exists with both changes + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.document_type == DocumentTypes.DATA + + # Remove pattern + doc_type_pattern.delete() + + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + def test_delta_cleanup_with_manual_changes(self): + """ + Test that deltas are retained when patterns are removed but manual changes exist. + """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + scraped_title="Original Title", + ) + + # Create pattern and let it create a delta + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.ASTROPHYSICS + ) + + # Add manual change to delta + delta_url = DeltaUrl.objects.get(url=curated_url.url) + delta_url.scraped_title = "Modified Title" + delta_url.save() + + # Remove pattern + pattern.delete() + + # Delta should be retained due to manual title change + retained_delta = DeltaUrl.objects.get(url=curated_url.url) + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.division == Divisions.GENERAL + + def test_multi_url_pattern_cleanup(self): + """ + Test cleanup behavior when removing a pattern that affects multiple URLs. + """ + # Create several curated URLs + curated_urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/data/set{i}.fits", + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(3) + ] + + # Create pattern affecting all URLs + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/data/*.fits", + document_type=DocumentTypes.DATA, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Modify one delta with additional changes + delta_to_retain = DeltaUrl.objects.get(url=curated_urls[0].url) + delta_to_retain.scraped_title = "Modified Title" + delta_to_retain.save() + + # Remove pattern + pattern.delete() + + # Only the delta with manual changes should remain + assert DeltaUrl.objects.count() == 1 + retained_delta = DeltaUrl.objects.get() + assert retained_delta.url == curated_urls[0].url + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.document_type == DocumentTypes.DOCUMENTATION + + def test_pattern_removal_after_promotion(self): + """ + Test that removing a pattern after promotion creates appropriate reversal deltas. 
+ """ + curated_urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/helio/data{i}.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(2) + ] + + # Create patterns and manually modify one URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/helio/*.fits", + division=Divisions.HELIOPHYSICS, + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Modify first delta with additional changes + delta = DeltaUrl.objects.get(url=curated_urls[0].url) + delta.scraped_title = "Modified Title" + delta.save() + + # Promote collection + self.collection.promote_to_curated() + + # Remove pattern - should create reversal deltas + division_pattern.delete() + + # Should have two deltas: one with just division reversal, + # one with division reversal plus preserved title change + assert DeltaUrl.objects.count() == 2 + + # Check delta with manual changes + modified_delta = DeltaUrl.objects.get(url=curated_urls[0].url) + assert modified_delta.division is None + assert modified_delta.scraped_title == "Modified Title" + + # Check plain reversal delta + plain_delta = DeltaUrl.objects.get(url=curated_urls[1].url) + assert plain_delta.division is None + assert plain_delta.scraped_title == curated_urls[1].scraped_title + + def test_pattern_removal_creates_null_deltas(self): + """ """ + curated_url = DeltaUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.ASTROPHYSICS, + document_type=DocumentTypes.DATA, + ) + + # Create pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.HELIOPHYSICS + ) + + # Verify initial state + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.division == Divisions.HELIOPHYSICS + + # Remove pattern + pattern.delete() + + # Should have delta with explicit NULL + new_delta = DeltaUrl.objects.get(url=curated_url.url) + assert new_delta.division is None + + # def test_pattern_removal_with_multiple_patterns(self): + # """ + # Test that removing one pattern doesn't NULL the field if other + # patterns of same type still affect the URL. + # """ + # # TODO: The official stance right now is to simply not make overlapping patterns like this + # # in the future, if this behavior is allowed, then this would be the test case. + # # right now, this behavior is not coded for, and this test does not pass. 
+ + # curated_url = CuratedUrlFactory( + # collection=self.collection, url="https://example.com/astro/data.fits", division=Divisions.GENERAL + # ) + + # # Create two patterns affecting same URL + # pattern1 = DeltaDivisionPattern.objects.create( + # collection=self.collection, + # match_pattern="*.fits", + # division=Divisions.ASTROPHYSICS, + # match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + # ) + + # DeltaDivisionPattern.objects.create( + # collection=self.collection, match_pattern=curated_url.url, division=Divisions.HELIOPHYSICS + # ) + + # # Remove one pattern + # pattern1.delete() + + # # Delta should retain value from remaining pattern + # delta = DeltaUrl.objects.get(url=curated_url.url) + # assert delta.division == Divisions.HELIOPHYSICS diff --git a/sde_collections/tests/test_field_modifier_unapply.py b/sde_collections/tests/test_field_modifier_unapply.py new file mode 100644 index 00000000..9f9f5b01 --- /dev/null +++ b/sde_collections/tests/test_field_modifier_unapply.py @@ -0,0 +1,252 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_field_modifier_unapply.py + +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl + +from .factories import CollectionFactory, DumpUrlFactory + + +class TestDeltaPatternUnapplyLogic(TestCase): + """Test complete lifecycle of pattern application and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_dump_to_delta_migration_with_pattern_lifecycle(self): + """ + Test complete lifecycle: + 1. Create dump URLs + 2. Migrate to delta URLs + 3. Apply patterns + 4. Promote to curated + 5. Delete pattern + 6. Verify deltas are created + 7. Promote to curated + 8. 
Verify curated URLs have division set to None + """ + # Create initial dump URLs + [ + DumpUrlFactory( + collection=self.collection, + url=f"https://example.com/science/data{i}.html", + ) + for i in range(3) + ] + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Verify dump URLs were migrated to delta URLs + self.assertEqual(DeltaUrl.objects.count(), 3) + self.assertEqual(DumpUrl.objects.count(), 0) + + # Apply division pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/science/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Verify pattern was applied to existing deltas + for delta_url in DeltaUrl.objects.all(): + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Promote to curated + self.collection.promote_to_curated() + + # Verify promotion + self.assertEqual(CuratedUrl.objects.count(), 3) + self.assertEqual(DeltaUrl.objects.count(), 0) + for curated_url in CuratedUrl.objects.all(): + self.assertEqual(curated_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Should have created new deltas for all URLs setting division to None + self.assertEqual(DeltaUrl.objects.count(), 3) + for delta_url in DeltaUrl.objects.all(): + self.assertIsNone(delta_url.division) + + # Promote to curated + self.collection.promote_to_curated() + + # Should updated all Curated setting division to None + self.assertEqual(CuratedUrl.objects.count(), 3) + for delta_url in CuratedUrl.objects.all(): + self.assertIsNone(delta_url.division) + + # Test for README_UNNAPLY_LOGIC.md Case 1: Delta Only (New URL) + def test_pattern_removal_with_delta_only(self): + """Test pattern removal when delta exists without corresponding curated URL.""" + # Create initial delta URL (simulating a new URL) + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/new.html") + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.BIOLOGY + ) + + # Verify pattern was applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Verify delta still exists but with division set to None + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(DeltaUrl.objects.count(), 1) + + # Test for README_UNNAPLY_LOGIC.md Case 2: Delta Created to Apply Pattern + def test_pattern_removal_with_simple_delta(self): + """Test pattern removal when delta was created just to apply pattern.""" + # Create initial curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, url="https://example.com/doc.html", division=None + ) + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + # Verify delta was created with pattern's value + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Verify delta was deleted since it would match curated + self.assertEqual(DeltaUrl.objects.filter(url=curated_url.url).count(), 0) + + # Test for README_UNNAPLY_LOGIC.md Case 3: Pre-existing Delta + def 
test_pattern_removal_preserves_other_changes(self): + """Test pattern removal when delta has other changes that should be preserved.""" + # Create curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, + url="https://example.com/doc.html", + division=None, + scraped_title="Original Title", + ) + + # Create delta with modified title + delta_url = DeltaUrl.objects.create( + collection=self.collection, url=curated_url.url, division=None, scraped_title="Modified Title" + ) + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + # Verify pattern was applied while preserving title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Remove pattern + pattern.delete() + + # Verify delta still exists with original changes but pattern effect removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Test for README_UNNAPLY_LOGIC.md Case 4: Multiple Pattern Effects + def test_pattern_removal_with_multiple_patterns(self): + """Test removal of one pattern when URL is affected by multiple patterns.""" + # Create curated URL + curated_url = CuratedUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create two patterns affecting the same URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Verify both patterns were applied + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + self.assertEqual(delta_url.document_type, DocumentTypes.DATA) + + # Remove division pattern + division_pattern.delete() + + # Verify delta still exists with doc type but division removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(delta_url.document_type, DocumentTypes.DATA) + + # Test for Case 5: Overlapping Patterns, Specific Deleted + def test_specific_pattern_removal_with_overlapping_patterns(self): + """Test removal of specific pattern when more general pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Create specific pattern + specific_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.ASTROPHYSICS + ) + + # Verify specific pattern took precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern now applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Test 
for Case 6: Overlapping Patterns, General Deleted + def test_general_pattern_removal_with_overlapping_patterns(self): + """Test removal of general pattern when more specific pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + general_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Create specific pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.ASTROPHYSICS + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) + + # Remove general pattern + general_pattern.delete() + + # Verify specific pattern still applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) diff --git a/sde_collections/tests/test_import_fulltexts.py b/sde_collections/tests/test_import_fulltexts.py new file mode 100644 index 00000000..17df38ea --- /dev/null +++ b/sde_collections/tests/test_import_fulltexts.py @@ -0,0 +1,83 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_import_fulltexts.py + +from unittest.mock import patch + +import pytest +from django.db.models.signals import post_save + +from sde_collections.models.collection import create_configs_on_status_change +from sde_collections.models.delta_url import DeltaUrl, DumpUrl +from sde_collections.tasks import fetch_and_replace_full_text +from sde_collections.tests.factories import CollectionFactory + + +@pytest.fixture +def disconnect_signals(): + # Disconnect the signal before each test + post_save.disconnect(create_configs_on_status_change, sender="sde_collections.Collection") + yield + # Reconnect the signal after each test + post_save.connect(create_configs_on_status_change, sender="sde_collections.Collection") + + +@pytest.mark.django_db +def test_fetch_and_replace_full_text(disconnect_signals): + collection = CollectionFactory(config_folder="test_folder") + + mock_batch = [ + {"url": "http://example.com/1", "full_text": "Test Text 1", "title": "Test Title 1"}, + {"url": "http://example.com/2", "full_text": "Test Text 2", "title": "Test Title 2"}, + ] + + def mock_generator(): + yield mock_batch + + with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts: + mock_get_full_texts.return_value = mock_generator() + + fetch_and_replace_full_text(collection.id, "lrm_dev") + + assert DumpUrl.objects.filter(collection=collection).count() == 0 + assert DeltaUrl.objects.filter(collection=collection).count() == 2 + + +@pytest.mark.django_db +def test_fetch_and_replace_full_text_large_dataset(disconnect_signals): + """Test processing a large number of records with proper pagination and batching.""" + collection = CollectionFactory(config_folder="test_folder") + + # Create sample data - 20,000 records in total + def create_batch(start_idx, size): + return [ + {"url": f"http://example.com/{i}", "full_text": f"Test Text {i}", "title": f"Test Title {i}"} + for i in range(start_idx, start_idx + size) + ] + + # Mock the API to return data in batches of 5000 (matching actual API pagination) + def mock_batch_generator(): + 
batch_size = 5000 + total_records = 20000 + + for start in range(0, total_records, batch_size): + yield create_batch(start, min(batch_size, total_records - start)) + + with patch("sde_collections.sinequa_api.Api.get_full_texts") as mock_get_full_texts: + mock_get_full_texts.return_value = mock_batch_generator() + + # Execute the task + result = fetch_and_replace_full_text(collection.id, "lrm_dev") + + # Verify total number of records + assert DeltaUrl.objects.filter(collection=collection).count() == 20000 + + # Verify some random records exist and have correct data + for i in [0, 4999, 5000, 19999]: # Check boundaries and middle + url = DeltaUrl.objects.get(url=f"http://example.com/{i}") + assert url.scraped_text == f"Test Text {i}" + assert url.scraped_title == f"Test Title {i}" + + # Verify batch processing worked by checking the success message + assert "Successfully processed 20000 records" in result + + # Verify no DumpUrls remain (should all be migrated to DeltaUrls) + assert DumpUrl.objects.filter(collection=collection).count() == 0 diff --git a/sde_collections/tests/test_include_patterns.py b/sde_collections/tests/test_include_patterns.py new file mode 100644 index 00000000..4212efa5 --- /dev/null +++ b/sde_collections/tests/test_include_patterns.py @@ -0,0 +1,132 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_include_patterns.py +import pytest + +from sde_collections.models.delta_patterns import ( + DeltaExcludePattern, + DeltaIncludePattern, +) +from sde_collections.models.delta_url import DeltaUrl +from sde_collections.tests.factories import ( + CollectionFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + + +@pytest.mark.django_db +def test_patterns_applied_after_migration(): + collection = CollectionFactory() + + # Add DumpUrls to migrate - using folder-based structure + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/1") + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/2") + DumpUrlFactory(collection=collection, url="https://example.com/included_docs/1") + DumpUrlFactory(collection=collection, url="https://example.com/other_docs/1") + # This URL should be included despite being in excluded_docs folder + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/included") + + # Create exclude pattern for excluded_docs folder + exclude_pattern = DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded_docs/*", match_pattern_type=2 + ) + + # Create include patterns + include_pattern = DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/included_docs/*", match_pattern_type=2 + ) + + # Specific include pattern that overrides the excluded_docs folder + specific_include = DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded_docs/included", match_pattern_type=1 + ) + + # Perform the migration + collection.migrate_dump_to_delta() + + # Verify pattern relationships + assert exclude_pattern.delta_urls.filter( + url="https://example.com/excluded_docs/1" + ).exists(), "Exclude pattern not applied to excluded_docs" + + assert include_pattern.delta_urls.filter( + url="https://example.com/included_docs/1" + ).exists(), "Include pattern not applied to included_docs" + + # Verify URL in other_docs is unaffected + assert not exclude_pattern.delta_urls.filter( + url="https://example.com/other_docs/1" + ).exists(), "Exclude pattern incorrectly 
applied to other_docs" + assert not include_pattern.delta_urls.filter( + url="https://example.com/other_docs/1" + ).exists(), "Include pattern incorrectly applied to other_docs" + + # Verify excluded status + excluded_url = DeltaUrl.objects.get(url="https://example.com/excluded_docs/1") + included_url = DeltaUrl.objects.get(url="https://example.com/included_docs/1") + neutral_url = DeltaUrl.objects.get(url="https://example.com/other_docs/1") + override_url = DeltaUrl.objects.get(url="https://example.com/excluded_docs/included") + + assert excluded_url.excluded is True, "URL in excluded_docs should be excluded" + assert included_url.excluded is False, "URL in included_docs should not be excluded" + assert neutral_url.excluded is False, "URL in other_docs should not be excluded" + assert ( + override_url.excluded is False + ), "Specifically included URL should not be excluded despite being in excluded_docs" + + # Verify both patterns are applied to the override URL + assert exclude_pattern.delta_urls.filter(url="https://example.com/excluded_docs/included").exists() + assert specific_include.delta_urls.filter(url="https://example.com/excluded_docs/included").exists() + + +# Test cases for the updated functionality +@pytest.mark.django_db +class TestUrlExclusionInclusion: + def test_exclusion_with_no_patterns(self): + """Test that URLs are not excluded by default""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection) + + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + + def test_exclusion_pattern_only(self): + """Test that exclude patterns work when no include patterns exist""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/excluded") + + DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded", match_pattern_type=1 + ) + + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + + def test_include_pattern_overrides_exclude(self): + """Test that include patterns take precedence over exclude patterns""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/both") + + # Create both exclude and include patterns for the same URL + DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/both", match_pattern_type=1 + ) + + DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/both", match_pattern_type=1 + ) + + # URL should not be excluded because include takes precedence + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + + def test_wildcard_patterns(self): + """Test that wildcard patterns work correctly with include/exclude precedence""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/docs/file.pdf") + + # Exclude all PDFs but include those in /docs/ + DeltaExcludePattern.objects.create(collection=collection, match_pattern="*.pdf", match_pattern_type=2) + + DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/docs/*", match_pattern_type=2 + ) + + # URL should not be excluded because the include pattern matches + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False diff --git a/sde_collections/tests/test_migrate_dump.py b/sde_collections/tests/test_migrate_dump.py new file mode 100644 index 00000000..c0f460d6 --- /dev/null +++ 
b/sde_collections/tests/test_migrate_dump.py @@ -0,0 +1,313 @@ +# noqa: F841 +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_migrate_dump.py + +import pytest + +from sde_collections.models.collection_choice_fields import DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDocumentTypePattern, + DeltaExcludePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + +DELTA_COMPARISON_FIELDS = ["scraped_title"] # Assuming a central definition + + +@pytest.mark.django_db +class TestMigrationHelpers: + def test_clear_delta_urls(self): + collection = CollectionFactory() + DeltaUrlFactory.create_batch(5, collection=collection) + collection.clear_delta_urls() + assert DeltaUrl.objects.filter(collection=collection).count() == 0 + + def test_clear_dump_urls(self): + collection = CollectionFactory() + DumpUrlFactory.create_batch(5, collection=collection) + collection.clear_dump_urls() + assert DumpUrl.objects.filter(collection=collection).count() == 0 + + def test_create_or_update_delta_url_add(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection) + collection.create_or_update_delta_url(dump_url, to_delete=False) + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.to_delete is False + for field in DELTA_COMPARISON_FIELDS: + assert getattr(delta, field) == getattr(dump_url, field) + + def test_create_or_update_delta_url_delete(self): + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection) + collection.create_or_update_delta_url(curated_url, to_delete=True) + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.to_delete is True + assert delta.scraped_title == curated_url.scraped_title + + +@pytest.mark.django_db +class TestMigrateDumpToDelta: + def test_new_url_in_dump_only(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection) + collection.migrate_dump_to_delta() + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.to_delete is False + for field in DELTA_COMPARISON_FIELDS: + assert getattr(delta, field) == getattr(dump_url, field) + + def test_url_in_both_with_different_field(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="New Title") + CuratedUrlFactory(collection=collection, url=dump_url.url, scraped_title="Old Title") + collection.migrate_dump_to_delta() + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.to_delete is False + assert delta.scraped_title == "New Title" + + def test_url_in_curated_only(self): + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection) + collection.migrate_dump_to_delta() + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.to_delete is True + assert delta.scraped_title == curated_url.scraped_title + + def test_identical_url_in_both(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="Same Title") + CuratedUrlFactory(collection=collection, url=dump_url.url, scraped_title="Same Title") + collection.migrate_dump_to_delta() + assert not DeltaUrl.objects.filter(url=dump_url.url).exists() + + def test_full_migration_flow(self): + collection = CollectionFactory() + dump_url_new = DumpUrlFactory(collection=collection) # New URL + dump_url_update = 
DumpUrlFactory(collection=collection, scraped_title="Updated Title") + CuratedUrlFactory(collection=collection, url=dump_url_update.url, scraped_title="Old Title") + curated_url_delete = CuratedUrlFactory(collection=collection) # Missing in Dump + + collection.migrate_dump_to_delta() + + # New URL moved to DeltaUrls + assert DeltaUrl.objects.filter(url=dump_url_new.url, to_delete=False).exists() + + # Updated URL moved to DeltaUrls + delta_update = DeltaUrl.objects.get(url=dump_url_update.url) + assert delta_update.scraped_title == "Updated Title" + assert delta_update.to_delete is False + + # Deleted URL in CuratedUrls marked as delete in DeltaUrls + delta_delete = DeltaUrl.objects.get(url=curated_url_delete.url) + assert delta_delete.to_delete is True + + def test_empty_collections(self): + collection = CollectionFactory() + collection.migrate_dump_to_delta() + assert DeltaUrl.objects.filter(collection=collection).count() == 0 + + def test_partial_data_in_dump_urls(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="") + collection.migrate_dump_to_delta() + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == "" + assert delta.to_delete is False + + +@pytest.mark.django_db +class TestMigrationIdempotency: + def test_migrate_dump_to_delta_idempotency(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection) + CuratedUrlFactory(collection=collection, url=dump_url.url, scraped_title="Different Title") + + # First migration run + collection.migrate_dump_to_delta() + assert DeltaUrl.objects.filter(url=dump_url.url).count() == 1 + + # Run migration again + collection.migrate_dump_to_delta() + assert DeltaUrl.objects.filter(url=dump_url.url).count() == 1 # Ensure no duplicates + + def test_create_or_update_delta_url_idempotency(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection) + + # First call + collection.create_or_update_delta_url(dump_url, to_delete=False) + assert DeltaUrl.objects.filter(url=dump_url.url).count() == 1 + + # Second call with the same data + collection.create_or_update_delta_url(dump_url, to_delete=False) + assert DeltaUrl.objects.filter(url=dump_url.url).count() == 1 # Should still be one + + +@pytest.mark.django_db +def test_create_or_update_delta_url_field_copy(): + collection = CollectionFactory() + dump_url = DumpUrlFactory( + collection=collection, + scraped_title="Test Title", + scraped_text="Test Text", + generated_title="Generated Test Title", + visited=True, + document_type=1, + division=2, + ) + + collection.create_or_update_delta_url(dump_url, to_delete=False) + delta = DeltaUrl.objects.get(url=dump_url.url) + + # Verify each field is copied correctly + for field in DumpUrl._meta.fields: + if field.name not in ["id", "collection", "url"]: + assert getattr(delta, field.name) == getattr(dump_url, field.name) + + +@pytest.mark.django_db +class TestGranularFullMigrationFlow: + def test_full_migration_new_url(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection) # New URL + collection.migrate_dump_to_delta() + + # New URL should be added to DeltaUrls + assert DeltaUrl.objects.filter(url=dump_url.url, to_delete=False).exists() + + def test_full_migration_updated_url(self): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="Updated Title") + collection.migrate_dump_to_delta() + + # URL with differing fields should be updated 
in DeltaUrls + delta_update = DeltaUrl.objects.get(url=dump_url.url) + assert delta_update.scraped_title == "Updated Title" + assert delta_update.to_delete is False + + def test_full_migration_deleted_url(self): + collection = CollectionFactory() + curated_url = CuratedUrlFactory(collection=collection) # URL to be deleted + collection.migrate_dump_to_delta() + + # Missing URL in DumpUrls should be marked as delete in DeltaUrls + delta_delete = DeltaUrl.objects.get(url=curated_url.url) + assert delta_delete.to_delete is True + + +@pytest.mark.django_db +def test_empty_delta_comparison_fields(): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="Same Title") + CuratedUrlFactory(collection=collection, url=dump_url.url, scraped_title="Same Title") # noqa + + global DELTA_COMPARISON_FIELDS + original_fields = DELTA_COMPARISON_FIELDS + DELTA_COMPARISON_FIELDS = [] # Simulate empty comparison fields + + try: + collection.migrate_dump_to_delta() + # No DeltaUrl should be created as there are no fields to compare + assert not DeltaUrl.objects.filter(url=dump_url.url).exists() + finally: + DELTA_COMPARISON_FIELDS = original_fields # Reset the fields after test + + +@pytest.mark.django_db +def test_partial_data_in_curated_urls(): + collection = CollectionFactory() + dump_url = DumpUrlFactory(collection=collection, scraped_title="Title Exists") + CuratedUrlFactory(collection=collection, url=dump_url.url, scraped_title="") # noqa + + collection.migrate_dump_to_delta() + + # Since `scraped_title` differs ("" vs "Title Exists"), it should create a DeltaUrl + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == "Title Exists" + assert delta.to_delete is False + + +@pytest.mark.django_db +def test_full_migration_with_patterns(): + """ + Test a complete migration flow with exclude patterns and document type patterns. + Tests the following scenarios: + - New URL from dump (should create delta) + - Updated URL from dump (should create delta with new title) + - Deleted URL (should create delta marked for deletion) + - URL matching exclude pattern (should be excluded) + - URL matching document type pattern (should have correct doc type) + """ + collection = CollectionFactory() + + # Set up initial DumpUrls and CuratedUrls + DumpUrlFactory(collection=collection, url="https://example.com/new", scraped_title="New Page") + DumpUrlFactory(collection=collection, url="https://example.com/update", scraped_title="Updated Title") + DumpUrlFactory(collection=collection, url="https://example.com/docs/guide", scraped_title="Documentation Guide") + + CuratedUrlFactory(collection=collection, url="https://example.com/update", scraped_title="Old Title") + CuratedUrlFactory(collection=collection, url="https://example.com/delete", scraped_title="Delete Me") + CuratedUrlFactory(collection=collection, url="https://example.com/docs/guide", scraped_title="Documentation Guide") + + # Create patterns before migration + exclude_pattern = DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/delete", + match_pattern_type=1, # Individual URL + reason="Test exclusion", + ) + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*", + match_pattern_type=2, # Multi-URL pattern + document_type=DocumentTypes.DOCUMENTATION, + ) + + # Perform migration + collection.migrate_dump_to_delta() + + # 1.
Check new URL was created as delta + new_delta = DeltaUrl.objects.get(url="https://example.com/new") + assert new_delta.to_delete is False + assert new_delta.scraped_title == "New Page" + + # 2. Check updated URL has new title in delta + update_delta = DeltaUrl.objects.get(url="https://example.com/update") + assert update_delta.to_delete is False + assert update_delta.scraped_title == "Updated Title" + + # 3. Check deleted URL is marked for deletion + delete_delta = DeltaUrl.objects.get(url="https://example.com/delete") + assert delete_delta.to_delete is True + assert delete_delta.excluded is True # Should be excluded due to pattern + + # 4. Check documentation URL has correct type + docs_delta = DeltaUrl.objects.get(url="https://example.com/docs/guide") + assert docs_delta.document_type == DocumentTypes.DOCUMENTATION + assert docs_delta.to_delete is False + + # 5. Verify pattern relationships + exclude_pattern.refresh_from_db() + doc_type_pattern.refresh_from_db() + + assert exclude_pattern.delta_urls.filter(url="https://example.com/delete").exists() + assert doc_type_pattern.delta_urls.filter(url="https://example.com/docs/guide").exists() + + # 6. Check total number of deltas is correct + assert DeltaUrl.objects.filter(collection=collection).count() == 4 + + # Optional: Test promotion to verify patterns stick + collection.promote_to_curated() + + # Verify results after promotion + assert not CuratedUrl.objects.filter(url="https://example.com/delete").exists() + assert CuratedUrl.objects.get(url="https://example.com/docs/guide").document_type == DocumentTypes.DOCUMENTATION + assert CuratedUrl.objects.get(url="https://example.com/update").scraped_title == "Updated Title" + assert not CuratedUrl.objects.filter(scraped_title="Old Title").exists() diff --git a/sde_collections/tests/test_migration.py b/sde_collections/tests/test_migration.py new file mode 100644 index 00000000..211145e9 --- /dev/null +++ b/sde_collections/tests/test_migration.py @@ -0,0 +1,264 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_migration.py + +import pytest +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, +) +from sde_collections.models.delta_url import DeltaUrl, DumpUrl +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + + +@pytest.mark.django_db +class TestMigrateDumpToDelta(TestCase): + """Test the migrate_dump_to_delta process comprehensively.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_basic_migration_new_url(self): + """Test basic migration of a new URL with no existing curated version.""" + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/new", + scraped_title="New Doc", + document_type=DocumentTypes.DOCUMENTATION, + division=Divisions.ASTROPHYSICS, + ) + + self.collection.migrate_dump_to_delta() + + # Verify delta created with all fields + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == dump_url.scraped_title + assert delta.document_type == dump_url.document_type + assert delta.division == dump_url.division + assert delta.to_delete is False + + def test_migration_with_differing_curated(self): + """Test migration when dump differs from existing curated URL.""" + url = "https://example.com/doc" + + 
dump_url = DumpUrlFactory( + collection=self.collection, + url=url, + scraped_title="New Title", + document_type=DocumentTypes.DATA, + ) + + CuratedUrlFactory( + collection=self.collection, + url=url, + scraped_title="Old Title", + document_type=DocumentTypes.DOCUMENTATION, + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=url) + assert delta.scraped_title == dump_url.scraped_title + assert delta.document_type == dump_url.document_type + assert delta.to_delete is False + + def test_migration_marks_missing_urls_for_deletion(self): + """Test that curated URLs not in dump are marked for deletion.""" + # Create only curated URL, no dump + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/old", + scraped_title="Old Doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.to_delete is True + assert delta.scraped_title == curated_url.scraped_title + + def test_migration_handles_null_fields(self): + """Test migration properly handles null/empty fields.""" + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/doc", + scraped_title="", # Empty string + document_type=None, # Null + division=None, # Null + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == "" + assert delta.document_type is None + assert delta.division is None + + def test_migration_clears_existing_deltas(self): + """Test that existing deltas are cleared before migration.""" + # Create pre-existing delta + old_delta = DeltaUrlFactory( + collection=self.collection, + url="https://example.com/old", + scraped_title="Old Delta", + ) + + # Create new dump URL + new_dump = DumpUrlFactory( + collection=self.collection, + url="https://example.com/new", + scraped_title="New Dump", + ) + + self.collection.migrate_dump_to_delta() + + # Verify old delta is gone and only new one exists + assert not DeltaUrl.objects.filter(url=old_delta.url).exists() + assert DeltaUrl.objects.filter(url=new_dump.url).exists() + + def test_migration_with_exclude_pattern(self): + """Test migration interacts correctly with exclude patterns.""" + # Create pattern first + DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="*internal*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Create dump URL that should be excluded + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/internal/doc", + scraped_title="Internal Doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.excluded is True + + def test_migration_with_field_modifying_pattern(self): + """Test migration with patterns that modify fields.""" + # Create document type pattern + DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="*.pdf", + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + document_type=DocumentTypes.DATA, + ) + + # Create division pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="*/astro/*", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.ASTROPHYSICS, + ) + + # Create dump URL that matches both patterns + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.pdf", + 
scraped_title="Astro Data", + document_type=DocumentTypes.DOCUMENTATION, # Different from pattern + division=Divisions.EARTH_SCIENCE, # Different from pattern + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.document_type == DocumentTypes.DATA + assert delta.division == Divisions.ASTROPHYSICS + + def test_migration_with_multiple_urls(self): + """Test migration with multiple URLs in various states.""" + # Create mix of dump and curated URLs + dump_urls = [DumpUrlFactory(collection=self.collection) for _ in range(3)] + curated_urls = [CuratedUrlFactory(collection=self.collection) for _ in range(2)] + + self.collection.migrate_dump_to_delta() + + # Should have deltas for all dump URLs + for dump_url in dump_urls: + assert DeltaUrl.objects.filter(url=dump_url.url, to_delete=False).exists() + + # Should have deletion deltas for curated URLs not in dump + for curated_url in curated_urls: + assert DeltaUrl.objects.filter(url=curated_url.url, to_delete=True).exists() + + def test_migration_with_empty_states(self): + """Test migration handles empty dump and curated states.""" + # No dump or curated URLs exist + self.collection.migrate_dump_to_delta() + assert DeltaUrl.objects.count() == 0 + + # Only curated URLs exist + CuratedUrlFactory(collection=self.collection) + self.collection.migrate_dump_to_delta() + assert DeltaUrl.objects.count() == 1 + assert DeltaUrl.objects.first().to_delete is True + + def test_migration_preserves_all_fields(self): + """Test that ALL fields are preserved during migration, not just changed ones.""" + # Create dump URL with all fields populated + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/doc", + scraped_title="Title", + scraped_text="Full text content", + generated_title="Generated Title", + document_type=DocumentTypes.DOCUMENTATION, + division=Divisions.ASTROPHYSICS, + visited=True, + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + + # Verify all fields were copied + fields_to_check = [ + "scraped_title", + "scraped_text", + "generated_title", + "document_type", + "division", + "visited", + ] + + for field in fields_to_check: + assert getattr(delta, field) == getattr(dump_url, field) + + def test_clearing_dump_urls(self): + """Test that dump URLs are cleared after migration.""" + DumpUrlFactory(collection=self.collection) + DumpUrlFactory(collection=self.collection) + + self.collection.migrate_dump_to_delta() + + assert DumpUrl.objects.filter(collection=self.collection).count() == 0 + + def test_pattern_relationships_updated(self): + """Test that pattern relationships are properly updated after migration.""" + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="*test*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/test/doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert pattern.delta_urls.filter(id=delta.id).exists() diff --git a/sde_collections/tests/test_models_collections.py b/sde_collections/tests/test_models_collections.py deleted file mode 100644 index a5a2a114..00000000 --- a/sde_collections/tests/test_models_collections.py +++ /dev/null @@ -1,12 +0,0 @@ -from config_generation.db_to_xml import XmlEditor - -from ..models.collection import Collection -from 
..models.collection_choice_fields import Divisions, DocumentTypes - - -def test_create_config_xml(): - collection = Collection(name="test", division=Divisions.EARTH_SCIENCE, document_type=DocumentTypes.DATA) - output_xml = collection.create_config_xml() - editor = XmlEditor(output_xml) - assert collection.tree_root == editor.fetch_treeroot() - assert collection.document_type == editor.fetch_document_type() diff --git a/sde_collections/tests/test_pattern_specificity.py b/sde_collections/tests/test_pattern_specificity.py new file mode 100644 index 00000000..7eafe978 --- /dev/null +++ b/sde_collections/tests/test_pattern_specificity.py @@ -0,0 +1,220 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_pattern_specificity.py + +import pytest + +from sde_collections.models.collection_choice_fields import DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDocumentTypePattern, + DeltaResolvedTitle, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import DeltaUrl +from sde_collections.tests.factories import CollectionFactory, DeltaUrlFactory + + +@pytest.mark.django_db +def test_title_pattern_multiple_resolved_titles_extended(): + """Test that patterns properly handle title resolution based on URL set size.""" + collection = CollectionFactory() + + # Create URLs with different levels of specificity + url1 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/item.html", scraped_title="Original Title" + ) + url2 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/item2.html", scraped_title="Original Title" + ) + url3 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/pdfs/item1.html", scraped_title="Original Title" + ) + + # Create general pattern (matches all URLs) + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", + title_pattern="{title} - Docs", + match_pattern_type=2, + ) + + # Verify initial pattern application + assert general_pattern.get_url_match_count() == 3 + assert DeltaUrl.objects.get(pk=url1.pk).generated_title == "Original Title - Docs" + assert DeltaUrl.objects.get(pk=url2.pk).generated_title == "Original Title - Docs" + assert DeltaUrl.objects.get(pk=url3.pk).generated_title == "Original Title - Docs" + + # Verify DeltaResolvedTitle entries + assert DeltaResolvedTitle.objects.count() == 3 + for url in [url1, url2, url3]: + resolved = DeltaResolvedTitle.objects.get(delta_url=url) + assert resolved.title_pattern == general_pattern + assert resolved.resolved_title == "Original Title - Docs" + + # Create more specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, match_pattern="*docs/pdfs*", title_pattern="{title} - HTML", match_pattern_type=2 + ) + + # Verify pattern match counts + assert specific_pattern.get_url_match_count() == 1 # Only matches pdfs URL + assert general_pattern.get_url_match_count() == 3 # Matches all URLs + + # Verify titles were updated appropriately + assert DeltaUrl.objects.get(pk=url1.pk).generated_title == "Original Title - Docs" # Unchanged + assert DeltaUrl.objects.get(pk=url2.pk).generated_title == "Original Title - Docs" # Unchanged + assert DeltaUrl.objects.get(pk=url3.pk).generated_title == "Original Title - HTML" # Updated + + # Verify DeltaResolvedTitle entries + assert DeltaResolvedTitle.objects.count() == 3 # Still one per URL + + # URLs with general pattern should be unchanged + for url in [url1, url2]: + resolved = 
DeltaResolvedTitle.objects.get(delta_url=url) + assert resolved.title_pattern == general_pattern + assert resolved.resolved_title == "Original Title - Docs" + + # PDF URL should now use specific pattern + resolved_pdf = DeltaResolvedTitle.objects.get(delta_url=url3) + assert resolved_pdf.title_pattern == specific_pattern + assert resolved_pdf.resolved_title == "Original Title - HTML" + + # Verify pattern relationships are maintained + assert url1 in general_pattern.delta_urls.all() + assert url2 in general_pattern.delta_urls.all() + assert url3 in general_pattern.delta_urls.all() + assert url3 in specific_pattern.delta_urls.all() + + +@pytest.mark.django_db +def test_field_modifying_pattern_layered_specificity(): + """Test overlapping patterns with different levels of specificity.""" + collection = CollectionFactory() + + # Create URLs in a hierarchy that allows for overlapping pattern matches + deep_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/analysis/v2/processor.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + mid_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/analysis/helper.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + top_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/simple.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + + # Create patterns with overlapping matches + broad_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/tools/*.py", # Matches all 3 URLs + document_type=DocumentTypes.SOFTWARETOOLS, + match_pattern_type=2, + ) + + mid_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/tools/analysis/*.py", # Matches 2 URLs (mid and deep) + document_type=DocumentTypes.DATA, # Different type to clearly show which pattern won + match_pattern_type=2, + ) + + specific_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/analysis/v2/*.py", # Matches only 1 URL (deep) + document_type=DocumentTypes.DOCUMENTATION, # Different type to clearly show which pattern won + match_pattern_type=2, + ) + + # Verify URL match counts + assert broad_pattern.get_url_match_count() == 3 + assert mid_pattern.get_url_match_count() == 2 + assert specific_pattern.get_url_match_count() == 1 + + # Verify patterns were applied correctly based on specificity + deep_tool.refresh_from_db() + mid_tool.refresh_from_db() + top_tool.refresh_from_db() + + # The most specific pattern (1 match) should win for the deep URL + assert deep_tool.document_type == DocumentTypes.DOCUMENTATION, "Deep URL should use most specific pattern" + + # The mid-level pattern (2 matches) should win for the middle URL + assert mid_tool.document_type == DocumentTypes.DATA, "Mid URL should use mid-level pattern" + + # The broad pattern (3 matches) should only affect the top URL + assert top_tool.document_type == DocumentTypes.SOFTWARETOOLS, "Top URL should use broad pattern" + + # Verify the relationships are tracked correctly + assert deep_tool.pk in specific_pattern.delta_urls.values_list("pk", flat=True) + assert deep_tool.pk in mid_pattern.delta_urls.values_list("pk", flat=True) + assert deep_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) + + assert mid_tool.pk in mid_pattern.delta_urls.values_list("pk", flat=True) + assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", 
flat=True) + + assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) + + +@pytest.mark.django_db +def test_pattern_specificity_tiebreaker(): + """Test that when patterns match the same number of URLs, longer patterns are considered more specific.""" + collection = CollectionFactory() + + # Create URLs that would result in same match count for different patterns + url1 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item1.html", scraped_title="Title 1" + ) + url2 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item2.html", scraped_title="Title 2" + ) + + # Create patterns with same match count but different lengths + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", # Shorter pattern + title_pattern="{title}", + match_pattern_type=2, + ) + + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific*", # Longer pattern + title_pattern="{title} - Specific", + match_pattern_type=2, + ) + + # Both patterns will match both URLs (same match count) + assert general_pattern.get_url_match_count() == 2 + assert specific_pattern.get_url_match_count() == 2 + + # But the longer pattern should be considered more specific + assert general_pattern.is_most_distinctive_pattern(url1) is False + assert specific_pattern.is_most_distinctive_pattern(url1) is True + + # Check that this applies to both URLs + assert general_pattern.is_most_distinctive_pattern(url2) is False + assert specific_pattern.is_most_distinctive_pattern(url2) is True + + # Create an even more specific pattern + very_specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific/item1*", # Even longer pattern + title_pattern="{title} - Very Specific", + match_pattern_type=2, + ) + + # It matches fewer URLs + assert very_specific_pattern.get_url_match_count() == 1 + + # For URL1, the very specific pattern should win due to fewer matches + assert general_pattern.is_most_distinctive_pattern(url1) is False + assert specific_pattern.is_most_distinctive_pattern(url1) is False + assert very_specific_pattern.is_most_distinctive_pattern(url1) is True + + # For URL2, the middle pattern should still win since very_specific doesn't match + assert general_pattern.is_most_distinctive_pattern(url2) is False + assert specific_pattern.is_most_distinctive_pattern(url2) is True + assert very_specific_pattern.is_most_distinctive_pattern(url2) is False diff --git a/sde_collections/tests/test_promote_collection.py b/sde_collections/tests/test_promote_collection.py new file mode 100644 index 00000000..8791efae --- /dev/null +++ b/sde_collections/tests/test_promote_collection.py @@ -0,0 +1,392 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_promote_collection.py +import pytest + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, + DeltaIncludePattern, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl +from sde_collections.tests.factories import CollectionFactory + + +@pytest.fixture +def collection(): + # Use the factory to create a collection with all necessary fields populated + return CollectionFactory() + + +@pytest.mark.django_db +def 
test_initial_promotion_creates_curated_urls(collection): + # Start with no DeltaUrls or CuratedUrls + assert DeltaUrl.objects.filter(collection=collection).count() == 0 + assert CuratedUrl.objects.filter(collection=collection).count() == 0 + + # Add new DeltaUrls to promote + DeltaUrl.objects.create(collection=collection, url="https://example1.com", scraped_title="Title 1") + DeltaUrl.objects.create(collection=collection, url="https://example2.com", scraped_title="Title 2") + + # Promote DeltaUrls to CuratedUrls + collection.promote_to_curated() + + # Check that CuratedUrls were created + curated_urls = CuratedUrl.objects.filter(collection=collection) + assert curated_urls.count() == 2 + assert curated_urls.filter(url="https://example1.com", scraped_title="Title 1").exists() + assert curated_urls.filter(url="https://example2.com", scraped_title="Title 2").exists() + + +@pytest.mark.django_db +def test_promotion_updates_existing_curated_urls(collection): + # Dictionary containing test data for each URL + test_data = { + "url1": {"url": "https://example1.com", "original_title": "Title 1", "updated_title": "Updated Title 1"}, + "url2": {"url": "https://example2.com", "original_title": "Title 2", "updated_title": "Updated Title 2"}, + } + + # Create initial DeltaUrls and promote them + for data in test_data.values(): + DeltaUrl.objects.create(collection=collection, url=data["url"], scraped_title=data["original_title"]) + collection.promote_to_curated() + + assert DeltaUrl.objects.all().count() == 0 + + # Re-create DeltaUrls with updated titles + for data in test_data.values(): + DeltaUrl.objects.create(collection=collection, url=data["url"], scraped_title=data["updated_title"]) + + # Promote the updates + collection.promote_to_curated() + + # Check that CuratedUrls were updated with the updated titles + for data in test_data.values(): + curated_url = CuratedUrl.objects.get(url=data["url"]) + assert curated_url.scraped_title == data["updated_title"] + + +@pytest.mark.django_db +def test_promotion_deletes_curated_urls(collection): + # Create initial DeltaUrls and promote them + DeltaUrl.objects.create(collection=collection, url="https://example1.com", scraped_title="Title 1") + DeltaUrl.objects.create(collection=collection, url="https://example2.com", scraped_title="Title 2") + collection.promote_to_curated() + + # create a new DeltaUrl marked for deletion + DeltaUrl.objects.create(collection=collection, url="https://example1.com", scraped_title="Title 1", to_delete=True) + + # Promote the deletion + collection.promote_to_curated() + + # Check that the CuratedUrl for the deleted DeltaUrl was removed + assert not CuratedUrl.objects.filter(url="https://example1.com").exists() + # Ensure the other CuratedUrl is still present + assert CuratedUrl.objects.filter(url="https://example2.com").exists() + + +@pytest.mark.django_db +def test_patterns_reapplied_after_promotion(collection): + # Add DeltaUrls matching the patterns + DeltaUrl.objects.create(collection=collection, url="https://exclude.com", scraped_title="Exclude This") + DeltaUrl.objects.create(collection=collection, url="https://include.com", scraped_title="Include This") + + # Create exclude and include patterns + exclude_pattern = DeltaExcludePattern.objects.create( + collection=collection, match_pattern_type=2, match_pattern="exclude.*" + ) + include_pattern = DeltaIncludePattern.objects.create( + collection=collection, match_pattern_type=2, match_pattern="include.*" + ) + + # Promote DeltaUrls to CuratedUrls + 
collection.promote_to_curated() + + # Refresh the patterns and check relationships + exclude_pattern.refresh_from_db() + include_pattern.refresh_from_db() + + # Verify that patterns are reapplied + curated_urls = CuratedUrl.objects.filter(collection=collection) + + assert curated_urls.filter(url="https://exclude.com").exists() + assert curated_urls.filter(url="https://include.com").exists() + + # Ensure exclude_pattern and include_pattern relationships are populated + assert exclude_pattern.curated_urls.filter(url="https://exclude.com").exists() + assert include_pattern.curated_urls.filter(url="https://include.com").exists() + + # Verify exclusion status + assert curated_urls.filter(url="https://exclude.com", excluded=True).exists() + + +@pytest.mark.django_db +def test_promotion_with_overlapping_patterns_and_deletion(): + """Test complex scenario with multiple overlapping patterns and URL deletion.""" + collection = CollectionFactory() + + # Create a more complex set of URLs that might trigger overlapping patterns + urls = [ + "https://example.com/docs/guide1", + "https://example.com/docs/guide2", + "https://example.com/api/v1/doc1", + "https://example.com/api/v1/doc2", + ] + + # Create initial DeltaUrls + for url in urls: + DeltaUrl.objects.create(collection=collection, url=url, scraped_title=f"Title for {url}") + + # Create overlapping patterns that will affect the same URLs + patterns = [ + {"pattern": ".*docs.*", "title": "Documentation: {title}"}, + {"pattern": ".*guide.*", "title": "Guide: {title}"}, + {"pattern": ".*api.*", "title": "API: {title}"}, + {"pattern": ".*doc[0-9]", "title": "Doc Number: {title}"}, + ] + + # Create and apply multiple patterns + title_patterns = [] + for p in patterns: + pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern=p["pattern"], + match_pattern_type=2, # Multi-URL Pattern + title_pattern=p["title"], + ) + pattern.apply() + title_patterns.append(pattern) + + # Initial promotion + collection.promote_to_curated() + + # Verify our complex setup + for pattern in title_patterns: + matching_urls = pattern.curated_urls.all() + print(f"\nPattern '{pattern.match_pattern}' matches {matching_urls.count()} URLs:") + for url in matching_urls: + print(f"- {url.url}") + + # Now create deletion DeltaUrls but with overlapping pattern matches + urls_to_delete = ["https://example.com/docs/guide1", "https://example.com/api/v1/doc1"] + for url in urls_to_delete: + DeltaUrl.objects.create(collection=collection, url=url, to_delete=True) + + # Try the promotion - this should trigger similar conditions to production + collection.promote_to_curated() + + # Print final state for debugging + print("\nFinal state:") + for pattern in title_patterns: + print(f"\nPattern '{pattern.match_pattern}':") + for url in pattern.curated_urls.all(): + print(f"- {url.url}") + + +@pytest.mark.django_db +def test_promotion_with_title_change(): + """Test updating a CuratedUrl that has active title pattern relationships.""" + collection = CollectionFactory() + + # Create initial DeltaUrl and promote it + url = "https://example.com/doc1" + DeltaUrl.objects.create(collection=collection, url=url, scraped_title="Original Title") + + # Create and apply a title pattern + pattern = DeltaTitlePattern.objects.create( + collection=collection, match_pattern=".*doc1", match_pattern_type=2, title_pattern="Pattern: {title}" + ) + pattern.apply() + + # Initial promotion + collection.promote_to_curated() + + # Verify pattern relationship exists + curated = 
CuratedUrl.objects.get(url=url) + assert pattern.curated_urls.filter(id=curated.id).exists() + + # Now create new DeltaUrl with updated title + DeltaUrl.objects.create(collection=collection, url=url, scraped_title="New Title") # Changed title + + # This should trigger the same error we're seeing in production + collection.promote_to_curated() + + +@pytest.mark.django_db +def test_promotion_maintains_pattern_relationships_through_updates(collection): + """Test that pattern relationships survive multiple promotions with updates""" + # Initial setup + DeltaUrl.objects.create(collection=collection, url="https://example.com", scraped_title="Title") + pattern = DeltaTitlePattern.objects.create( + collection=collection, match_pattern="example.com", match_pattern_type=1, title_pattern="Pattern: {title}" + ) + + collection.promote_to_curated() + + # Record initial state + curated = CuratedUrl.objects.get(url="https://example.com") + initial_id = curated.id + initial_pattern_relations = list(pattern.curated_urls.all()) + + # Create new delta with changes + DeltaUrl.objects.create(collection=collection, url="https://example.com", scraped_title="New Title") + collection.promote_to_curated() + + # Verify relationships maintained + curated.refresh_from_db() + assert curated.id == initial_id # ID should not change + assert list(pattern.curated_urls.all()) == initial_pattern_relations + + +@pytest.mark.django_db +def test_sequential_promotions_with_multiple_patterns(collection): + """Test complex scenario with multiple promotions and pattern changes""" + # Initial setup with two URLs + urls = ["https://example.com/doc", "https://example.com/guide"] + for url in urls: + DeltaUrl.objects.create(collection=collection, url=url, scraped_title=f"Title for {url}") + + # First pattern and promotion + pattern1 = DeltaTitlePattern.objects.create( + collection=collection, match_pattern="doc", match_pattern_type=2, title_pattern="Doc: {title}" + ) + + collection.promote_to_curated() + + # Record state after first promotion + initial_ids = {url: CuratedUrl.objects.get(url=url).id for url in urls} + + DeltaUrl.objects.create(collection=collection, url="https://example.com/guide", scraped_title="Updated guide") + collection.promote_to_curated() + + pattern2 = DeltaTitlePattern.objects.create( + collection=collection, match_pattern="guide", match_pattern_type=2, title_pattern="Guide: {title}" + ) + assert not pattern2.curated_urls.filter(url__contains="guide").exists() + + # Verify state + for url in urls: + curated = CuratedUrl.objects.get(url=url) + assert curated.id == initial_ids[url] # IDs should be preserved + + collection.promote_to_curated() + + # Verify pattern relationships + assert pattern1.curated_urls.filter(url__contains="doc").exists() + assert pattern2.curated_urls.filter(url__contains="guide").exists() + + +@pytest.mark.django_db +def test_promotion_with_division_changes(collection): + """Test that division patterns are correctly promoted and applied""" + # Initial setup + DeltaUrl.objects.create(collection=collection, url="https://example.com/astrophysics", division=Divisions.GENERAL) + DeltaUrl.objects.create(collection=collection, url="https://example.com/helio", division=Divisions.GENERAL) + + # Create and apply division patterns + DeltaDivisionPattern.objects.create( + collection=collection, + match_pattern="astrophysics", + match_pattern_type=2, # Multi-URL pattern + division=Divisions.ASTROPHYSICS, + ) + + DeltaDivisionPattern.objects.create( + collection=collection, match_pattern="helio", 
match_pattern_type=2, division=Divisions.HELIOPHYSICS + ) + + # Promote and verify divisions were set + collection.promote_to_curated() + + assert CuratedUrl.objects.get(url__contains="astrophysics").division == Divisions.ASTROPHYSICS + assert CuratedUrl.objects.get(url__contains="helio").division == Divisions.HELIOPHYSICS + + +@pytest.mark.django_db +def test_promotion_with_document_type_changes(collection): + """Test document type patterns through promotion""" + # Create URLs with default doc type + DeltaUrl.objects.create( + collection=collection, url="https://example.com/data/set1", document_type=DocumentTypes.DOCUMENTATION + ) + DeltaUrl.objects.create( + collection=collection, url="https://example.com/tools/tool1", document_type=DocumentTypes.DOCUMENTATION + ) + + # Set up patterns for different doc types + DeltaDocumentTypePattern.objects.create( + collection=collection, match_pattern="data/*", match_pattern_type=2, document_type=DocumentTypes.DATA + ).apply() + + DeltaDocumentTypePattern.objects.create( + collection=collection, match_pattern="tools/*", match_pattern_type=2, document_type=DocumentTypes.SOFTWARETOOLS + ).apply() + + collection.promote_to_curated() + + # Verify document types were correctly set + assert CuratedUrl.objects.get(url__contains="/data/").document_type == DocumentTypes.DATA + assert CuratedUrl.objects.get(url__contains="/tools/").document_type == DocumentTypes.SOFTWARETOOLS + + +@pytest.mark.django_db +def test_promotion_with_multiple_metadata_changes_dump(collection): + """Test complex scenario with multiple metadata changes through multiple promotions""" + # Initial URL we'll be working with + url = "https://example.com/helio/data" + + # Create initial DumpUrl + DumpUrl.objects.create( + collection=collection, + url=url, + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + scraped_title="Raw Data Title", + ) + + # Migrate DumpUrls to DeltaUrls + collection.migrate_dump_to_delta() + + # Create patterns that will affect this URL + DeltaDivisionPattern.objects.create( + collection=collection, match_pattern="*helio*", match_pattern_type=2, division=Divisions.HELIOPHYSICS + ) + + DeltaDocumentTypePattern.objects.create( + collection=collection, match_pattern="*data*", match_pattern_type=2, document_type=DocumentTypes.DATA + ) + + DeltaTitlePattern.objects.create( + collection=collection, match_pattern="*data*", match_pattern_type=2, title_pattern="Heliophysics Data: {title}" + ) + + # First promotion + collection.promote_to_curated() + + # Verify initial promotion worked correctly + curated = CuratedUrl.objects.get(url=url) + assert curated.division == Divisions.HELIOPHYSICS + assert curated.document_type == DocumentTypes.DATA + assert curated.generated_title == "Heliophysics Data: Raw Data Title" + + # Create new DumpUrl with updated data to simulate a new crawl + DumpUrl.objects.create( + collection=collection, + url=url, + division=Divisions.GENERAL, # These will be overridden by patterns + document_type=DocumentTypes.DOCUMENTATION, # These will be overridden by patterns + scraped_title="Updated Data Title", + ) + + # Migrate new dump to delta + collection.migrate_dump_to_delta() + + # Second promotion - should maintain pattern-applied metadata while updating the title + collection.promote_to_curated() + + # Verify final state + curated = CuratedUrl.objects.get(url=url) + assert curated.division == Divisions.HELIOPHYSICS # Should still be preserved from pattern + assert curated.document_type == DocumentTypes.DATA # Should still be 
preserved from pattern + assert curated.generated_title == "Heliophysics Data: Updated Data Title" # Should reflect new title diff --git a/sde_collections/tests/test_sinequa_api.py b/sde_collections/tests/test_sinequa_api.py new file mode 100644 index 00000000..85a24bc7 --- /dev/null +++ b/sde_collections/tests/test_sinequa_api.py @@ -0,0 +1,365 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_sinequa_api.py +import json +from unittest.mock import MagicMock, patch + +import pytest +import requests +from django.utils import timezone + +from sde_collections.models.collection import WorkflowStatusChoices +from sde_collections.sinequa_api import Api +from sde_collections.tests.factories import CollectionFactory, UserFactory + + +@pytest.mark.django_db +class TestApiClass: + """ + Test suite for the Sinequa API integration. + Tests cover authentication, query construction, response processing, + and error handling across different server configurations. + """ + + @pytest.fixture + def collection(self): + """Fixture to create a collection object for testing.""" + user = UserFactory() + return CollectionFactory( + curated_by=user, + curation_started=timezone.now(), + config_folder="example_config", + workflow_status=WorkflowStatusChoices.RESEARCH_IN_PROGRESS, + ) + + @pytest.fixture + def api_instance(self): + """ + Fixture to create an Api instance with mocked server configs. + Provides a consistent test environment with predefined credentials. + """ + with patch( + "sde_collections.sinequa_api.server_configs", + { + "test_server": { + "app_name": "test_app", + "query_name": "test_query", + "base_url": "http://testserver.com/api", + "index": "test_index", + } + }, + ): + return Api(server_name="test_server", user="test_user", password="test_pass", token="test_token") + + @patch("requests.post") + def test_process_response_success(self, mock_post, api_instance): + """ + Test that process_response successfully handles and parses API responses. + Verifies: + 1. Correct HTTP request processing + 2. JSON response parsing + 3. Return value structure + """ + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"key": "value"} + mock_post.return_value = mock_response + + response = api_instance.process_response("http://example.com", payload={"test": "data"}) + assert response == {"key": "value"} + mock_post.assert_called_once() + + @patch("requests.post") + def test_process_response_failure(self, mock_post, api_instance): + """ + Test that process_response properly handles failed API requests. + Verifies appropriate exception raising and error messaging. + """ + mock_response = MagicMock() + mock_response.status_code = 500 + mock_post.return_value = mock_response + mock_response.raise_for_status.side_effect = requests.RequestException("Internal Server Error") + + with pytest.raises(requests.RequestException, match="Internal Server Error"): + api_instance.process_response("http://example.com", payload={"test": "data"}) + + def test_missing_token_for_sql_query(self, api_instance): + """ + Test that attempting SQL queries without a token raises an appropriate error. + Verifies token validation before query execution. + """ + api_instance._provided_token = None + with pytest.raises(ValueError, match="Token is required"): + api_instance._execute_sql_query("SELECT * FROM test") + + @patch("sde_collections.sinequa_api.Api.process_response") + def test_query(self, mock_process_response, api_instance): + """ + Test that query method: + 1.
Constructs the correct URL and payload based on input parameters + 2. Processes API response correctly + 3. Returns expected data structure + """ + mock_process_response.return_value = {"result": "success"} + response = api_instance.query(page=1, collection_config_folder="folder") + assert response == {"result": "success"} + + # Verify payload construction + mock_process_response.assert_called_once() + call_args = mock_process_response.call_args + assert "folder" in str(call_args) # Verify collection folder is included + assert "page" in str(call_args) # Verify pagination parameters + + def test_process_rows_to_records(self, api_instance): + """ + Test processing of raw SQL row data into structured record dictionaries. + Verifies: + 1. Correct parsing of valid input data + 2. Error handling for malformed rows + 3. Output format consistency + """ + # Test valid input + valid_rows = [["http://example.com/1", "Text 1", "Title 1"], ["http://example.com/2", "Text 2", "Title 2"]] + expected_output = [ + {"url": "http://example.com/1", "full_text": "Text 1", "title": "Title 1"}, + {"url": "http://example.com/2", "full_text": "Text 2", "title": "Title 2"}, + ] + assert api_instance._process_rows_to_records(valid_rows) == expected_output + + # Test invalid row length + invalid_rows = [["http://example.com", "Text"]] # Missing title + with pytest.raises(ValueError, match="Invalid row format at index 0"): + api_instance._process_rows_to_records(invalid_rows) + + @patch("sde_collections.sinequa_api.Api.process_response") + def test_execute_sql_query(self, mock_process_response, api_instance): + """ + Test SQL query execution with token-based authentication. + Verifies: + 1. Query construction + 2. Token validation + 3. Response processing + """ + mock_process_response.return_value = {"Rows": [], "TotalRowCount": 0} + + # Test successful query + result = api_instance._execute_sql_query("SELECT * FROM test") + assert result == {"Rows": [], "TotalRowCount": 0} + + # Test query with missing token + api_instance._provided_token = None + with pytest.raises(ValueError, match="Token is required"): + api_instance._execute_sql_query("SELECT * FROM test") + + @patch("sde_collections.sinequa_api.Api._execute_sql_query") + def test_get_full_texts_pagination(self, mock_execute_sql, api_instance): + """ + Test pagination handling in get_full_texts method. + Verifies: + 1. Correct batch processing + 2. Accurate record counting + 3. Proper iteration termination + """ + # Mock responses for two pages of results + mock_execute_sql.side_effect = [ + { + "Rows": [["http://example.com/1", "Text 1", "Title 1"], ["http://example.com/2", "Text 2", "Title 2"]], + "TotalRowCount": 3, + }, + {"Rows": [["http://example.com/3", "Text 3", "Title 3"]], "TotalRowCount": 3}, + {"Rows": [], "TotalRowCount": 3}, + ] + + # Collect all batches from the iterator + batches = list(api_instance.get_full_texts("test_folder")) + + assert len(batches) == 2 # Should have two batches + assert len(batches[0]) == 2 # First batch has 2 records + assert len(batches[1]) == 1 # Second batch has 1 record + + # Verify content of batches + assert batches[0] == [ + {"url": "http://example.com/1", "full_text": "Text 1", "title": "Title 1"}, + {"url": "http://example.com/2", "full_text": "Text 2", "title": "Title 2"}, + ] + assert batches[1] == [{"url": "http://example.com/3", "full_text": "Text 3", "title": "Title 3"}] + + def test_get_full_texts_missing_index(self, api_instance): + """ + Test error handling when index configuration is missing. 
+ Verifies appropriate error message and exception type. + """ + api_instance.config.pop("index", None) + with pytest.raises(ValueError, match="Index not defined for server"): + next(api_instance.get_full_texts("test_folder")) + + @pytest.mark.parametrize( + "server_name,expect_auth", + [ + ("xli", True), # dev server should have auth + ("production", False), # prod server should not have auth + ], + ) + @patch("requests.post") + def test_query_authentication(self, mock_post, server_name, expect_auth, api_instance): + """ + Test authentication handling for different server types. + Verifies: + 1. Dev servers require authentication + 2. Production servers skip authentication + 3. Correct credential handling + """ + api_instance.server_name = server_name + mock_post.return_value = MagicMock(status_code=200, json=lambda: {"result": "success"}) + + response = api_instance.query(page=1, collection_config_folder="folder") + assert response == {"result": "success"} + + called_url = mock_post.call_args[0][0] + auth_present = "?Password=test_pass&User=test_user" in called_url + assert auth_present == expect_auth + + @patch("requests.post") + def test_query_dev_server_missing_credentials(self, mock_post, api_instance): + """ + Test error handling for dev servers with missing credentials. + Verifies appropriate error messages and authentication requirements. + """ + api_instance.server_name = "xli" + api_instance._provided_user = None + api_instance._provided_password = None + + with pytest.raises(ValueError, match="Authentication error: Missing credentials for dev server"): + api_instance.query(page=1) + + @patch("sde_collections.sinequa_api.Api._execute_sql_query") + def test_get_full_texts_batch_size_reduction(self, mock_execute_sql, api_instance): + """ + Test batch size reduction logic when queries fail. + Verifies: + 1. Progressive batch size reduction + 2. Retry mechanism + 3. Successful recovery + """ + # Mock first query to fail, then succeed with smaller batch + mock_execute_sql.side_effect = [ + requests.RequestException("Query too large"), # First attempt fails + { + "Rows": [["http://example.com/1", "Text 1", "Title 1"]], + "TotalRowCount": 1, + }, # Succeeds with smaller batch + ] + + batches = list(api_instance.get_full_texts("test_folder", batch_size=100, min_batch_size=1)) + + # Verify the batches were processed correctly after size reduction + assert len(batches) == 1 + assert len(batches[0]) == 1 + assert batches[0][0]["url"] == "http://example.com/1" + + # Verify batch size reduction logic + assert mock_execute_sql.call_count == 2 + first_call = mock_execute_sql.call_args_list[0][0][0] + second_call = mock_execute_sql.call_args_list[1][0][0] + assert "COUNT 100" in first_call + assert "COUNT 50" in second_call # Should be halved from 100 + + @patch("sde_collections.sinequa_api.Api._execute_sql_query") + def test_get_full_texts_minimum_batch_size(self, mock_execute_sql, api_instance): + """ + Test behavior when reaching minimum batch size. + Verifies error handling at minimum batch size threshold. 
+ """ + mock_execute_sql.side_effect = requests.RequestException("Query failed") + + # Start with batch_size=4, min_batch_size=1 + with pytest.raises(ValueError, match="Failed to process batch even at minimum size 1"): + list(api_instance.get_full_texts("test_folder", batch_size=4, min_batch_size=1)) + + # Verify retry attempts + assert mock_execute_sql.call_count == 3 + calls = mock_execute_sql.call_args_list + assert "COUNT 4" in calls[0][0][0] # First try with 4 + assert "COUNT 2" in calls[1][0][0] # Second try with 2 + assert "COUNT 1" in calls[2][0][0] # Final try with 1 + + @patch("requests.post") + def test_sql_query_construction(self, mock_post, api_instance): + """ + Test direct SQL query execution with specific URL and payload validation. + Verifies: + 1. Correct URL construction + 2. Proper payload formatting + 3. Token-based authentication + """ + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"Rows": [["http://example.com", "sample text", "sample title"]]} + mock_post.return_value = mock_response + + sql = "SELECT url1, text, title FROM test_index WHERE collection = '/SDE/sample_folder/'" + api_instance._execute_sql_query(sql) + + # Verify URL and payload construction + mock_post.assert_called_once() + call_args = mock_post.call_args + + # Get the actual payload from the call arguments + _, kwargs = call_args + payload = json.loads(kwargs.get("data", "{}")) + + # Verify each component separately + assert "engine.sql" in call_args[0][0] # Verify endpoint + assert kwargs["headers"]["Authorization"] == "Bearer test_token" # Verify token usage + assert payload["sql"] == sql # Verify SQL query inclusion + + def test_process_full_text_response(self, api_instance): + """ + Test static method for processing full text response data. + Verifies: + 1. Correct parsing of raw response data + 2. Proper dictionary structure creation + 3. Error handling for invalid response format + """ + # Test valid response processing + raw_response = { + "Rows": [ + ["http://example.com/article1", "Full text 1", "Title 1"], + ["http://example.com/article2", "Full text 2", "Title 2"], + ] + } + expected = [ + {"url": "http://example.com/article1", "full_text": "Full text 1", "title": "Title 1"}, + {"url": "http://example.com/article2", "full_text": "Full text 2", "title": "Title 2"}, + ] + processed = Api._process_full_text_response(raw_response) + assert processed == expected + + # Test invalid response format + with pytest.raises(ValueError, match="Invalid response format"): + Api._process_full_text_response({"wrong_key": []}) + + @patch("sde_collections.sinequa_api.Api._execute_sql_query") + def test_get_full_texts_batch_size_progression(self, mock_execute_sql, api_instance): + """ + Test multiple batch size reductions followed by successful query. + Verifies: + 1. Progressive batch size reduction steps + 2. Recovery after multiple failures + 3. 
Final successful query execution + """ + mock_execute_sql.side_effect = [ + requests.RequestException("First failure"), + requests.RequestException("Second failure"), + {"Rows": [["http://example.com/1", "Text 1", "Title 1"]], "TotalRowCount": 1}, + ] + + # Start with batch_size=100, should reduce to 25 before succeeding + batches = list(api_instance.get_full_texts("test_folder", batch_size=100, min_batch_size=1)) + + assert len(batches) == 1 # Should get one successful batch + assert mock_execute_sql.call_count == 3 + + calls = mock_execute_sql.call_args_list + # Verify the progression of batch sizes + assert "COUNT 100" in calls[0][0][0] # First attempt + assert "COUNT 50" in calls[1][0][0] # After first failure + assert "COUNT 25" in calls[2][0][0] # After second failure diff --git a/sde_collections/tests/test_tdamm_tags.py b/sde_collections/tests/test_tdamm_tags.py new file mode 100644 index 00000000..f520b63b --- /dev/null +++ b/sde_collections/tests/test_tdamm_tags.py @@ -0,0 +1,197 @@ +# docker-compose -f local.yml run --rm django pytest -s sde_collections/tests/test_tdamm_tags.py + +import pytest + +from sde_collections.tests.factories import ( + CollectionFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + +from ..models.delta_url import CuratedUrl, DeltaUrl + + +@pytest.mark.django_db +class TestTDAMMFields: + """Test core TDAMM tags functionality with DeltaUrl""" + + def test_manual_and_ml_field_behavior(self): + """Test the relationship between manual and ML fields""" + url = DeltaUrlFactory() + + # Setting tdamm_tag affects only manual field + url.tdamm_tag = ["MMA_M_EM", "MMA_M_G"] + assert url.tdamm_tag_manual == ["MMA_M_EM", "MMA_M_G"] + assert url.tdamm_tag_ml is None + + # ML field must be set explicitly + url.tdamm_tag_ml = ["MMA_M_N"] + assert url.tdamm_tag_ml == ["MMA_M_N"] + assert url.tdamm_tag_manual == ["MMA_M_EM", "MMA_M_G"] + + def test_field_priority(self): + """Test that manual field takes priority over ML field""" + url = DeltaUrlFactory() + + # Set ML tags first + url.tdamm_tag_ml = ["MMA_M_EM"] + assert url.tdamm_tag == ["MMA_M_EM"] + + # Set manual tags - should take priority + url.tdamm_tag = ["MMA_M_G"] + assert url.tdamm_tag == ["MMA_M_G"] + + # Clear manual tags - should fall back to ML tags + url.tdamm_tag_manual = None + assert url.tdamm_tag == ["MMA_M_EM"] + + def test_empty_array_behavior(self): + """Test handling of empty arrays vs None""" + url = DeltaUrlFactory() + + # Set ML tags + url.tdamm_tag_ml = ["MMA_M_EM"] + assert url.tdamm_tag == ["MMA_M_EM"] + + # Empty manual array should not override ML tags + url.tdamm_tag = [] + assert url.tdamm_tag == ["MMA_M_EM"] + + # None manual value should not override ML tags + url.tdamm_tag = None + assert url.tdamm_tag == ["MMA_M_EM"] + + def test_field_deletion(self): + """Test deletion of fields""" + url = DeltaUrlFactory() + + # Set both manual and ML tags + url.tdamm_tag = ["MMA_M_EM"] + url.tdamm_tag_ml = ["MMA_M_G"] + + # Delete tdamm_tag + del url.tdamm_tag + assert url.tdamm_tag_manual is None + assert url.tdamm_tag_ml is None + + def test_multiple_tags(self): + """Test handling of multiple tags""" + url = DeltaUrlFactory() + + # Test multiple manual tags + manual_tags = ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + url.tdamm_tag = manual_tags + assert url.tdamm_tag_manual == manual_tags + + # Test multiple ML tags + ml_tags = ["MMA_O_BH", "MMA_O_N"] + url.tdamm_tag_ml = ml_tags + assert url.tdamm_tag_ml == ml_tags + + def test_persistence(self): + """Test that values persist after save""" + url = 
DeltaUrlFactory() + + # Set values + url.tdamm_tag = ["MMA_M_EM"] + url.tdamm_tag_ml = ["MMA_M_G"] + url.save() + + # Refresh from database + url.refresh_from_db() + assert url.tdamm_tag_manual == ["MMA_M_EM"] + assert url.tdamm_tag_ml == ["MMA_M_G"] + + +@pytest.mark.django_db +class TestTDAMMTagMigration: + """Test TDAMM tag behavior during the migration process""" + + @pytest.fixture + def collection(self): + return CollectionFactory() + + def test_tdamm_tags_preserved_in_migration(self, collection): + """Test that TDAMM tags are preserved when promoting from Dump to Delta""" + dump_url = DumpUrlFactory(collection=collection, url="https://example.com") + dump_url.tdamm_tag = ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + dump_url.tdamm_tag_ml = ["MMA_O_BH", "MMA_O_N"] + dump_url.save() + + # Migrate to delta + collection.migrate_dump_to_delta() + + # Verify tags in the migrated DeltaUrl + delta_url = DeltaUrl.objects.get(url="https://example.com") + assert delta_url.tdamm_tag == ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + assert delta_url.tdamm_tag_manual == ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + assert delta_url.tdamm_tag_ml == ["MMA_O_BH", "MMA_O_N"] + + def test_tdamm_tags_updated_in_migration(self, collection): + """Test that TDAMM tags are updated during re-migration""" + # Initial migration + dump_url = DumpUrlFactory(collection=collection, url="https://example.com") + dump_url.tdamm_tag = ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + dump_url.tdamm_tag_ml = ["MMA_O_BH", "MMA_O_N"] + dump_url.save() + + # Migrate to delta + collection.migrate_dump_to_delta() + + # Create new DumpUrl with updated tags + updated_dump_url = DumpUrlFactory(collection=collection, url="https://example.com") + updated_dump_url.tdamm_tag = ["MMA_M_G"] + updated_dump_url.save() + collection.migrate_dump_to_delta() + + # Verify tags were updated + delta_url = DeltaUrl.objects.get(url="https://example.com") + assert delta_url.tdamm_tag == ["MMA_M_G"] + assert delta_url.tdamm_tag_manual == ["MMA_M_G"] + + +@pytest.mark.django_db +class TestTDAMMTagPromotion: + """Test TDAMM tag behavior during the promotion process""" + + @pytest.fixture + def collection(self): + return CollectionFactory() + + def test_tdamm_tags_preserved_in_promotion(self, collection): + """Test that TDAMM tags are preserved when promoting from Delta to Curated""" + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com") + delta_url.tdamm_tag = ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + delta_url.tdamm_tag_ml = ["MMA_O_BH", "MMA_O_N"] + delta_url.save() + + # Promote to curated + collection.promote_to_curated() + + # Verify tags in the promoted CuratedUrl + curated_url = CuratedUrl.objects.get(url="https://example.com") + assert curated_url.tdamm_tag == ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + assert curated_url.tdamm_tag_manual == ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + assert curated_url.tdamm_tag_ml == ["MMA_O_BH", "MMA_O_N"] + + def test_tdamm_tags_updated_in_promotion(self, collection): + """Test that TDAMM tags are updated during re-promotion""" + # Initial promotion + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com") + delta_url.tdamm_tag = ["MMA_M_EM", "MMA_M_G", "MMA_M_N"] + delta_url.tdamm_tag_ml = ["MMA_O_BH", "MMA_O_N"] + delta_url.save() + + # Promote to curated + collection.promote_to_curated() + + # Create new DeltaUrl with updated tags + updated_delta_url = DeltaUrlFactory(collection=collection, url="https://example.com") + updated_delta_url.tdamm_tag = ["MMA_M_G"] + updated_delta_url.save() + 
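+        # Promote again so the updated delta replaces the previously curated tag values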
collection.promote_to_curated() + + # Verify tags were updated + curated_url = CuratedUrl.objects.get(url="https://example.com") + assert curated_url.tdamm_tag == ["MMA_M_G"] + assert curated_url.tdamm_tag_manual == ["MMA_M_G"] diff --git a/sde_collections/tests/test_title_pattern_unapply.py b/sde_collections/tests/test_title_pattern_unapply.py new file mode 100644 index 00000000..db8ed7e5 --- /dev/null +++ b/sde_collections/tests/test_title_pattern_unapply.py @@ -0,0 +1,281 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_pattern_unapply.py + +from django.test import TestCase + +from sde_collections.models.delta_patterns import ( + DeltaResolvedTitle, + DeltaResolvedTitleError, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import CollectionFactory, DumpUrlFactory + + +class TestTitlePatternUnapplyLogic(TestCase): + """Test complete lifecycle of title pattern application and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_dump_to_delta_migration_with_pattern_lifecycle(self): + """ + Test complete lifecycle: + 1. Create dump URLs + 2. Migrate to delta URLs + 3. Apply title pattern + 4. Promote to curated + 5. Delete pattern + 6. Verify deltas are created + 7. Promote to curated + 8. Verify curated URLs have empty generated titles + """ + # Create initial dump URLs + [ + DumpUrlFactory( + collection=self.collection, + url=f"https://example.com/science/data{i}.html", + ) + for i in range(3) + ] + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Apply title pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/science/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="Science Document {url}", + ) + + # Verify pattern was applied to all deltas and resolution tracked + for delta_url in DeltaUrl.objects.all(): + self.assertTrue(delta_url.generated_title.startswith("Science Document")) + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Promote to curated + self.collection.promote_to_curated() + + # Verify promotion + self.assertEqual(CuratedUrl.objects.count(), 3) + self.assertEqual(DeltaUrl.objects.count(), 0) + for curated_url in CuratedUrl.objects.all(): + self.assertTrue(curated_url.generated_title.startswith("Science Document")) + + # Remove pattern + pattern.delete() + + # Verify new deltas created with empty titles + self.assertEqual(DeltaUrl.objects.count(), 3) + for delta_url in DeltaUrl.objects.all(): + self.assertEqual(delta_url.generated_title, "") + + # Verify resolution tracking cleared + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + self.assertEqual(DeltaResolvedTitleError.objects.count(), 0) + + def test_pattern_removal_with_delta_only(self): + """Test pattern removal when delta exists without corresponding curated URL.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/new.html") + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="New Document {url}" + ) + + # Verify pattern was applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("New Document")) + 
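+        # A DeltaResolvedTitle entry should link this delta to the pattern that produced its title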
self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify delta still exists but with empty title + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.generated_title, "") + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + + def test_pattern_removal_with_simple_delta(self): + """Test pattern removal when delta was created just to apply pattern.""" + # Create initial curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, url="https://example.com/doc.html", generated_title="" + ) + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, title_pattern="Documentation {url}" + ) + + # Verify delta was created with pattern's title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertTrue(delta_url.generated_title.startswith("Documentation")) + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify delta was deleted since it would match curated + self.assertEqual(DeltaUrl.objects.filter(url=curated_url.url).count(), 0) + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + + def test_pattern_removal_preserves_other_changes(self): + """Test pattern removal when delta has other changes that should be preserved.""" + # Create curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, + url="https://example.com/doc.html", + generated_title="", + scraped_title="Original Title", + ) + + # Create delta with modified title + delta_url = DeltaUrl.objects.create( + collection=self.collection, url=curated_url.url, generated_title="", scraped_title="Modified Title" + ) + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, title_pattern="API Doc {url}" + ) + + # Verify pattern was applied while preserving scraped title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Doc")) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Remove pattern + pattern.delete() + + # Verify delta still exists with original changes but pattern effect removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.generated_title, "") + self.assertEqual(delta_url.scraped_title, "Modified Title") + + def test_pattern_removal_with_multiple_patterns(self): + """Test removal of one pattern when URL is affected by multiple patterns.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="Specific Title {url}" + ) + + # Create another pattern for the same URL + generic_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="Generic Title {url}", + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("Specific Title")) + + # Verify resolution tracking + 
self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern is now applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("Generic Title")) + + # Verify resolution tracking updated + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=generic_pattern).exists()) + + def test_specific_pattern_removal_with_overlapping_patterns(self): + """Test removal of specific pattern when more general pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="General Document {url}", + ) + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="API Spec {url}" + ) + + # Verify specific pattern took precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern now applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("General Document")) + + def test_general_pattern_removal_with_overlapping_patterns(self): + """Test removal of general pattern when more specific pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + general_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="General Document {url}", + ) + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="API Spec {url}" + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Verify correct resolution tracking + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + # Remove general pattern + general_pattern.delete() + + # Verify specific pattern still applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Verify resolution tracking unchanged + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + def test_pattern_removal_with_title_error(self): + """Test handling of title resolution errors during pattern removal.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create pattern that will cause error (invalid template) + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern=delta_url.url, + 
title_pattern="{invalid}", # This should cause an error + ) + + # Verify error was recorded + self.assertTrue(DeltaResolvedTitleError.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify error tracking cleared + self.assertEqual(DeltaResolvedTitleError.objects.count(), 0) + + # Verify delta has empty title + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.generated_title, "") diff --git a/sde_collections/tests/test_title_resolution.py b/sde_collections/tests/test_title_resolution.py new file mode 100644 index 00000000..88c15aef --- /dev/null +++ b/sde_collections/tests/test_title_resolution.py @@ -0,0 +1,133 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_resolution.py + +from unittest.mock import Mock, patch + +import pytest + +from ..utils.title_resolver import ( + clean_text, + is_valid_xpath, + parse_title, + resolve_brace, + resolve_title, + resolve_xpath, + validate_fstring, +) + + +def test_parse_title(): + # Test basic string + assert parse_title("Simple Title") == [("str", "Simple Title")] + + # Test f-string + assert parse_title("Hello {title}") == [("str", "Hello "), ("brace", "{title}")] + + # Test xpath + assert parse_title("xpath://h1") == [("xpath", "//h1")] + + # Test complex pattern + result = parse_title("xpath://h1 | {title} - {collection}") + assert result == [ + ("xpath", "//h1"), + ("str", " | "), + ("brace", "{title}"), + ("str", " - "), + ("brace", "{collection}"), + ] + + +def test_is_valid_xpath(): + assert is_valid_xpath("//h1") is True + assert is_valid_xpath("//div[@class='title']") is True + assert is_valid_xpath("invalid xpath") is False + assert is_valid_xpath("//h1[") is False + + +def test_validate_fstring(): + # Valid cases - should not raise + validate_fstring("{title}") + validate_fstring("{url}") + validate_fstring("{collection}") + + # Invalid cases + with pytest.raises(ValueError): + validate_fstring("{invalid_var}") + with pytest.raises(ValueError): + validate_fstring("{title.upper()}") + with pytest.raises(ValueError): + validate_fstring("{len(title)}") + + +def test_resolve_brace(): + context = {"title": "Test Title", "url": "https://example.com", "collection": "Test Collection"} + + assert resolve_brace("{title}", context) == "Test Title" + assert resolve_brace("{title} - {collection}", context) == "Test Title - Test Collection" + + with pytest.raises(ValueError): + resolve_brace("{invalid}", context) + + +def test_clean_text(): + # Test whitespace handling + assert clean_text(" Title \n With\tSpaces ") == "Title With Spaces" + + # Test HTML entities + assert clean_text("Title & More") == "Title & More" + + # Test unicode normalization + assert clean_text("Café") == "Cafe" + + +@patch("requests.get") +def test_resolve_xpath(mock_get): + mock_response = Mock() + mock_response.ok = True + mock_response.content = b""" + + +
+        <html>
+            <body>
+                <h1>Test Title</h1>
+                <div class="content">Inner Content</div>
+            </body>
+        </html>
+ + + """ + mock_get.return_value = mock_response + + # Test basic xpath + assert resolve_xpath("//h1", "https://example.com") == "Test Title" + assert resolve_xpath("//div[@class='content']", "https://example.com") == "Inner Content" + + # Test error cases + mock_response.ok = False + with pytest.raises(ValueError): + resolve_xpath("//h1", "https://example.com") + + mock_response.ok = True + with pytest.raises(ValueError): + resolve_xpath("//nonexistent", "https://example.com") + + +@patch("requests.get") +def test_resolve_title(mock_get): + mock_response = Mock() + mock_response.ok = True + mock_response.content = b""" + + +
+        <html>
+            <body>
+                <h1>Dynamic Content</h1>
+            </body>
+        </html>
+ + + """ + mock_get.return_value = mock_response + + context = {"title": "Original Title", "url": "https://example.com", "collection": "Test Collection"} + + # Test combination of xpath and f-string + pattern = "xpath://h1 | {title} - {collection}" + assert resolve_title(pattern, context) == "Dynamic Content | Original Title - Test Collection" + + # Test simple f-string + assert resolve_title("{title} ({collection})", context) == "Original Title (Test Collection)" + + # Test plain string + assert resolve_title("Static Title", context) == "Static Title" diff --git a/sde_collections/tests/test_url_apis.py b/sde_collections/tests/test_url_apis.py new file mode 100644 index 00000000..1c842e8a --- /dev/null +++ b/sde_collections/tests/test_url_apis.py @@ -0,0 +1,266 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_apis.py + +import pytest +from django.urls import reverse +from rest_framework import status + +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, +) + + +@pytest.mark.django_db +class TestDeltaURLAPIView: + """Test suite for the Delta URL API endpoints""" + + def setup_method(self): + """Setup test data""" + self.collection = CollectionFactory() + + def test_delta_url_api_empty_list(self, client): + """Should return empty list when no delta URLs exist""" + url = reverse("sde_collections:delta-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_delta_url_api_with_data(self, client): + """Should return list of non-excluded delta URLs for given config folder""" + delta_url1 = DeltaUrlFactory(collection=self.collection) + + url = reverse("sde_collections:delta-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["url"] == delta_url1.url + expected_title = delta_url1.generated_title if delta_url1.generated_title else delta_url1.scraped_title + assert data[0]["title"] == expected_title + + def test_delta_url_api_wrong_config_folder(self, client): + """Should return empty list for non-existent config folder""" + url = reverse("sde_collections:delta-url-api", kwargs={"config_folder": "nonexistent"}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_delta_url_api_serializer_fields(self, client): + """Should return all expected fields in serializer""" + DeltaUrlFactory(collection=self.collection) + + url = reverse("sde_collections:delta-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"][0] + expected_fields = {"url", "title", "document_type", "file_extension", "tree_root", "tdamm_tag"} + assert set(data.keys()) == expected_fields + + def test_delta_url_api_pagination(self, client): + """Should correctly paginate results when multiple URLs exist""" + [DeltaUrlFactory(collection=self.collection) for _ in range(15)] + + url = reverse("sde_collections:delta-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json() + assert "next" in data + 
assert "previous" in data + assert "count" in data + assert data["count"] == 15 + + +@pytest.mark.django_db +class TestCuratedURLAPIView: + """Test suite for the Curated URL API endpoints""" + + def setup_method(self): + """Setup test data""" + self.collection = CollectionFactory() + + def test_curated_url_api_empty_list(self, client): + """Should return empty list when no curated URLs exist""" + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_curated_url_api_with_data(self, client): + """Should return list of curated URLs for given config folder""" + curated_url1 = CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title") + + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["url"] == curated_url1.url + assert data[0]["title"] == curated_url1.generated_title + + def test_curated_url_api_wrong_config_folder(self, client): + """Should return empty list for non-existent config folder""" + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": "nonexistent"}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_curated_url_api_serializer_fields(self, client): + """Should return all expected fields in serializer""" + CuratedUrlFactory(collection=self.collection) + + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"][0] + expected_fields = {"url", "title", "document_type", "file_extension", "tree_root", "tdamm_tag"} + assert set(data.keys()) == expected_fields + + def test_candidate_url_api_alias(self, client): + """Should verify candidate-urls-api endpoint aliases to curated-urls-api""" + curated_url = CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title") + + curated_url = reverse( + "sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder} + ) + candidate_url = reverse( + "sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder} + ) + + curated_response = client.get(curated_url) + candidate_response = client.get(candidate_url) + + assert curated_response.status_code == status.HTTP_200_OK + assert candidate_response.status_code == status.HTTP_200_OK + assert curated_response.json()["results"] == candidate_response.json()["results"] + + def test_multiple_collections(self, client): + """Should only return URLs from the specified collection""" + other_collection = CollectionFactory() + + url1 = CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title 1") + CuratedUrlFactory(collection=other_collection, generated_title="Test Generated Title 2") + + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["url"] == url1.url + assert data[0]["title"] == 
url1.generated_title + + def test_curated_url_api_invalid_filters(self, client): + """Should handle invalid filter parameters gracefully""" + CuratedUrlFactory(collection=self.collection) + + url = reverse("sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(f"{url}?invalid_filter=value") + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + + +@pytest.mark.django_db +class TestCandidateURLAPIView: + """Test suite for the Candidate URL API endpoints. Note that this is an alias for Curated URL API""" + + def setup_method(self): + """Setup test data""" + self.collection = CollectionFactory() + + def test_candidate_url_api_empty_list(self, client): + """Should return empty list when no candidate URLs exist""" + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_candidate_url_api_with_data(self, client): + """Should return list of candidate URLs for given config folder""" + candidate_url1 = CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title") + + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["url"] == candidate_url1.url + assert data[0]["title"] == candidate_url1.generated_title + + def test_candidate_url_api_wrong_config_folder(self, client): + """Should return empty list for non-existent config folder""" + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": "nonexistent"}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + assert len(response.json()["results"]) == 0 + + def test_candidate_url_api_serializer_fields(self, client): + """Should return all expected fields in serializer""" + CuratedUrlFactory(collection=self.collection) + + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"][0] + expected_fields = {"url", "title", "document_type", "file_extension", "tree_root", "tdamm_tag"} + assert set(data.keys()) == expected_fields + + def test_candidate_url_api_alias(self, client): + """Should verify candidate-urls-api endpoint aliases to curated-urls-api""" + CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title") + + candidate_url = reverse( + "sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder} + ) + curated_url = reverse( + "sde_collections:curated-url-api", kwargs={"config_folder": self.collection.config_folder} + ) + + candidate_response = client.get(candidate_url) + curated_response = client.get(curated_url) + + assert candidate_response.status_code == status.HTTP_200_OK + assert curated_response.status_code == status.HTTP_200_OK + assert candidate_response.json()["results"] == curated_response.json()["results"] + + def test_multiple_collections(self, client): + """Should only return URLs from the specified collection""" + other_collection = CollectionFactory() + + url1 =
CuratedUrlFactory(collection=self.collection, generated_title="Test Generated Title 1") + CuratedUrlFactory(collection=other_collection, generated_title="Test Generated Title 2") + + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(url) + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 + assert data[0]["url"] == url1.url + assert data[0]["title"] == url1.generated_title + + def test_candidate_url_api_invalid_filters(self, client): + """Should handle invalid filter parameters gracefully""" + CuratedUrlFactory(collection=self.collection) + + url = reverse("sde_collections:candidate-url-api", kwargs={"config_folder": self.collection.config_folder}) + response = client.get(f"{url}?invalid_filter=value") + + assert response.status_code == status.HTTP_200_OK + data = response.json()["results"] + assert len(data) == 1 diff --git a/sde_collections/tests/test_workflow_status_triggers.py b/sde_collections/tests/test_workflow_status_triggers.py new file mode 100644 index 00000000..82f66720 --- /dev/null +++ b/sde_collections/tests/test_workflow_status_triggers.py @@ -0,0 +1,209 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_workflow_status_triggers.py +from unittest.mock import Mock, patch + +import pytest +from django.db import transaction +from django.test import TestCase, TransactionTestCase + +from sde_collections.models.collection_choice_fields import ( + ReindexingStatusChoices, + WorkflowStatusChoices, +) +from sde_collections.models.delta_url import DeltaUrl, DumpUrl +from sde_collections.tasks import fetch_and_replace_full_text +from sde_collections.tests.factories import CollectionFactory, DumpUrlFactory + + +class TestWorkflowStatusTransitions(TestCase): + def setUp(self): + self.collection = CollectionFactory() + + @patch("sde_collections.models.collection.Collection.create_scraper_config") + @patch("sde_collections.models.collection.Collection.create_indexer_config") + def test_ready_for_engineering_triggers_config_creation(self, mock_indexer, mock_scraper): + """When status changes to READY_FOR_ENGINEERING, it should create configs""" + self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_ENGINEERING + self.collection.save() + + mock_scraper.assert_called_once_with(overwrite=False) + mock_indexer.assert_called_once_with(overwrite=False) + + @patch("sde_collections.tasks.fetch_and_replace_full_text.delay") + def test_indexing_finished_triggers_full_text_fetch(self, mock_fetch): + """When status changes to INDEXING_FINISHED_ON_DEV, it should trigger full text fetch""" + self.collection.workflow_status = WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV + self.collection.save() + + mock_fetch.assert_called_once_with(self.collection.id, "lrm_dev") + + @patch("sde_collections.models.collection.Collection.create_plugin_config") + def test_ready_for_curation_triggers_plugin_config(self, mock_plugin): + """When status changes to READY_FOR_CURATION, it should create plugin config""" + self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_CURATION + self.collection.save() + + mock_plugin.assert_called_once_with(overwrite=True) + + @patch("sde_collections.models.collection.Collection.promote_to_curated") + def test_curated_triggers_promotion(self, mock_promote): + """When status changes to CURATED, it should promote DeltaUrls to CuratedUrls""" + self.collection.workflow_status = 
WorkflowStatusChoices.CURATED + self.collection.save() + + mock_promote.assert_called_once() + + @patch("sde_collections.models.collection.Collection.add_to_public_query") + def test_quality_check_perfect_triggers_public_query(self, mock_add): + """When status changes to QUALITY_CHECK_PERFECT, it should add to public query""" + self.collection.workflow_status = WorkflowStatusChoices.QUALITY_CHECK_PERFECT + self.collection.save() + + mock_add.assert_called_once() + + +class TestReindexingStatusTransitions(TestCase): + def setUp(self): + # Mock the GitHubHandler to return valid XML content + self.mock_github_handler = patch("sde_collections.models.collection.GitHubHandler").start() + + self.mock_github_handler.return_value._get_file_contents.return_value.decoded_content = ( + b'\n' + b"\n" + b" false\n" + b" Sample Collection\n" + b"" + ) + + self.addCleanup(patch.stopall) + + # Create the collection with the mock applied + self.collection = CollectionFactory( + workflow_status=WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + reindexing_status=ReindexingStatusChoices.REINDEXING_NOT_NEEDED, + ) + + @patch("sde_collections.tasks.fetch_and_replace_full_text.delay") + def test_reindexing_finished_triggers_full_text_fetch(self, mock_fetch): + """When reindexing status changes to FINISHED, it should trigger full text fetch""" + self.collection.reindexing_status = ReindexingStatusChoices.REINDEXING_FINISHED_ON_DEV + self.collection.save() + + mock_fetch.assert_called_once_with(self.collection.id, "lrm_dev") + + @patch("sde_collections.models.collection.Collection.promote_to_curated") + def test_reindexing_curated_triggers_promotion(self, mock_promote): + """When reindexing status changes to CURATED, it should promote DeltaUrls""" + self.collection.reindexing_status = ReindexingStatusChoices.REINDEXING_CURATED + self.collection.save() + + mock_promote.assert_called_once() + + +class TestFullTextImport(TestCase): + def setUp(self): + self.collection = CollectionFactory() + self.existing_dump = DumpUrlFactory(collection=self.collection) + self.api_response = [ + {"url": "http://example.com/1", "title": "Title 1", "full_text": "Content 1"}, + {"url": "http://example.com/2", "title": "Title 2", "full_text": "Content 2"}, + ] + + @patch("sde_collections.tasks.Api") + @patch("sde_collections.models.collection.GitHubHandler") + def test_full_text_import_workflow(self, MockGitHub, MockApi): + """Test the full process of importing full text data""" + # Setup mock GitHub handler with proper XML content + mock_github = Mock() + mock_github.check_file_exists.return_value = True + mock_file_contents = Mock() + # Include all the fields that convert_template_to_plugin_indexer checks for + mock_xml = """ + + false + false + false + false + false + false + true + true + false + true + true + true + True + + false + true + false + + + + + + """ + mock_file_contents.decoded_content = mock_xml.encode("utf-8") + mock_github._get_file_contents.return_value = mock_file_contents + MockGitHub.return_value = mock_github + + # Setup mock API + mock_api = Mock() + mock_api.get_full_texts.return_value = [self.api_response] + MockApi.return_value = mock_api + + # Setup initial workflow state + self.collection.workflow_status = WorkflowStatusChoices.INDEXING_FINISHED_ON_DEV + self.collection.save() + + # Run the import + fetch_and_replace_full_text(self.collection.id, "lrm_dev") + + # Verify old DumpUrls were cleared + assert not DumpUrl.objects.filter(id=self.existing_dump.id).exists() + + # Verify new Delta urls were created + 
new_deltas = DeltaUrl.objects.filter(collection=self.collection) + assert new_deltas.count() == 2 + assert {dump.url for dump in new_deltas} == {"http://example.com/1", "http://example.com/2"} + + # Verify status updates + self.collection.refresh_from_db() + assert self.collection.workflow_status == WorkflowStatusChoices.READY_FOR_CURATION + + +class TestErrorHandling(TransactionTestCase): + def setUp(self): + self.collection = CollectionFactory(workflow_status=WorkflowStatusChoices.RESEARCH_IN_PROGRESS) + + @patch("sde_collections.models.collection.Collection.create_scraper_config") + @patch("sde_collections.models.collection.Collection.create_indexer_config") + def test_config_creation_failure_handling(self, mock_indexer, mock_scraper): + """Test handling of config creation failures""" + mock_scraper.side_effect = Exception("Config creation failed") + + initial_status = self.collection.workflow_status + + with pytest.raises(Exception): + with transaction.atomic(): + self.collection.workflow_status = WorkflowStatusChoices.READY_FOR_ENGINEERING + self.collection.save() + + # Verify status wasn't changed on error + self.collection.refresh_from_db() + assert self.collection.workflow_status == initial_status + + @patch("sde_collections.tasks.Api") + def test_full_text_fetch_failure_handling(self, MockApi): + """Test handling of full text fetch failures""" + mock_api = Mock() + mock_api.get_full_texts.side_effect = Exception("API error") + MockApi.return_value = mock_api + + initial_status = self.collection.workflow_status + + with pytest.raises(Exception): + fetch_and_replace_full_text(self.collection.id, "lrm_dev") + + # Verify status wasn't changed on error + self.collection.refresh_from_db() + assert self.collection.workflow_status == initial_status diff --git a/sde_collections/urls.py b/sde_collections/urls.py index 4e3d0534..9ee77759 100644 --- a/sde_collections/urls.py +++ b/sde_collections/urls.py @@ -8,7 +8,8 @@ router = routers.DefaultRouter() router.register(r"collections", views.CollectionViewSet, basename="collection") router.register(r"collections-read", views.CollectionReadViewSet, basename="collection-read") -router.register(r"candidate-urls", views.CandidateURLViewSet) +router.register(r"delta-urls", views.DeltaURLViewSet) +router.register(r"curated-urls", views.CuratedURLViewSet) router.register(r"exclude-patterns", views.ExcludePatternViewSet) router.register(r"include-patterns", views.IncludePatternViewSet) router.register(r"title-patterns", views.TitlePatternViewSet) @@ -32,31 +33,37 @@ views.IndexingInstructionsView.as_view(), name="indexing_instructions", ), - path("api/assign-division//", views.CandidateURLViewSet.as_view({"post": "update_division"})), + path("api/assign-division//", views.DeltaURLViewSet.as_view({"post": "update_division"})), path( "delete-required-url/", view=views.RequiredUrlsDeleteView.as_view(), name="delete_required_url", ), path( - "/candidate-urls", - view=views.CandidateURLsListView.as_view(), - name="candidate_urls", + "/delta-urls", + view=views.DeltaURLsListView.as_view(), + name="delta_urls", ), path( "consolidate/", view=views.WebappGitHubConsolidationView.as_view(), name="consolidate_db_and_github_configs", ), - # List all CandidateURL instances: /candidate-urls/ - # Retrieve a specific CandidateURL instance: /candidate-urls/{id}/ - # Create a new CandidateURL instance: /candidate-urls/ - # Update an existing CandidateURL instance: /candidate-urls/{id}/ - # Delete an existing CandidateURL instance: /candidate-urls/{id}/ + # List all 
DeltaURL instances: /delta-urls/ + # Retrieve a specific DeltaURL instance: /delta-urls/{id}/ + # Create a new DeltaURL instance: /delta-urls/ + # Update an existing DeltaURL instance: /delta-urls/{id}/ + # Delete an existing DeltaURL instance: /delta-urls/{id}/ path("api/", include(router.urls)), + path( + "delta-urls-api//", + view=views.DeltaURLAPIView.as_view(), + name="delta-url-api", + ), + path("curated-urls-api//", view=views.CuratedURLAPIView.as_view(), name="curated-url-api"), path( "candidate-urls-api//", - view=views.CandidateURLAPIView.as_view(), + view=views.CuratedURLAPIView.as_view(), name="candidate-url-api", ), path("titles-and-errors/", views.TitlesAndErrorsView.as_view(), name="titles-and-errors-list"), diff --git a/sde_collections/utils/README_PAIRED_FIELD_DESCRIPTOR.md b/sde_collections/utils/README_PAIRED_FIELD_DESCRIPTOR.md new file mode 100644 index 00000000..cd6cc4fc --- /dev/null +++ b/sde_collections/utils/README_PAIRED_FIELD_DESCRIPTOR.md @@ -0,0 +1,90 @@ +# Paired Field Descriptor System + +## Overview + +The Paired Field Descriptor is a Django model descriptor designed to manage fields with both manual and machine learning (ML) generated variants. This system provides a flexible approach to handling metadata fields, with a focus on tag management and priority handling. + +## Core Concepts + +### Field Pairing Mechanism +The descriptor automatically creates two associated fields for each defined descriptor: +- **Manual Field**: Manually entered or curated metadata +- **ML Field**: Machine learning generated metadata + +### Key Characteristics +- Manual field takes precedence over ML field +- Flexible field type support +- Handles empty arrays and None values +- Requires explicit setting of ML fields + +## Implementation + +### Creating a Paired Field Descriptor + +```python +tdamm_tag = PairedFieldDescriptor( + field_name="tdamm_tag", + field_type=ArrayField(models.CharField(max_length=255, choices=TDAMMTags.choices), blank=True, null=True), + verbose_name="TDAMM Tags", +) +``` + +#### Parameters +- `field_name`: Base name for the descriptor +- `field_type`: Django field type (supports various field types) +- `verbose_name`: Optional human-readable name + +### Field Naming Convention +When you define a descriptor, two additional fields are automatically created: +- `{field_name}_manual`: For manually entered values +- `{field_name}_ml`: For machine learning generated values + +## Characteristics + +### Field Priority +1. Manual field always takes precedence +2. ML field serves as a fallback +3. Empty manual fields or None values defer to ML field + +### Field Retrieval +```python +# Retrieval automatically prioritizes manual field +tags = url.tdamm_tag # Returns manual tags if exist, otherwise ML tags +``` + +### Field Setting +```python +# Sets only the manual field +url.tdamm_tag = ["MMA_M_EM", "MMA_M_G"] + +# ML field must be set explicitly +url.tdamm_tag_ml = ["MMA_O_BH"] +``` + +### Field Deletion +```python +# Deletes both manual and ML fields +del url.tdamm_tag +``` + +### Data Preservation +- Paired fields maintain their state during: + - Dump to Delta migration + - Delta to Curated promotion +- Manual entries take precedence in all migration stages + +## Serializer Integration + +Here's the way to configure the serializer to retrieve the paired field, seamlessly extracting either manual or ML tags based on the descriptor's priority rules. 
+```python +class DeltaUrlSerializer(serializers.ModelSerializer): + tdamm_tag = serializers.SerializerMethodField() + + class Meta: + model = DeltaUrl + fields = ("url", "tdamm_tag") + + def get_tdamm_tag(self, obj): + tags = obj.tdamm_tag + return tags if tags is not None else [] +``` diff --git a/sde_collections/utils/health_check.py b/sde_collections/utils/health_check.py index 19c45369..0e09bd87 100644 --- a/sde_collections/utils/health_check.py +++ b/sde_collections/utils/health_check.py @@ -127,12 +127,12 @@ def create_exclude_pattern_report(match_pattern, url): # check with http:// if match_pattern.find("http://") == -1: - url = f"http://{match_pattern}" + url = f"http://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) if match_pattern.find("https://") == -1: - url = f"https://{match_pattern}" + url = f"https://{match_pattern}" # noqa: E231 if url in candidate_urls_sinequa: exclude_pattern_report.append(create_exclude_pattern_report(match_pattern, url)) else: diff --git a/sde_collections/utils/paired_field_descriptor.py b/sde_collections/utils/paired_field_descriptor.py new file mode 100644 index 00000000..afebc35a --- /dev/null +++ b/sde_collections/utils/paired_field_descriptor.py @@ -0,0 +1,73 @@ +class PairedFieldDescriptor: + """ + A descriptor that manages paired manual/ML fields where: + - Setting the main field only affects the manual field + - ML field must be set explicitly + - Getting the main field returns manual if present, otherwise ML + """ + + def __init__(self, field_name, field_type, verbose_name=""): + self.field_name = field_name + self.manual_field_name = f"{field_name}_manual" + self.ml_field_name = f"{field_name}_ml" + self.field_type = field_type + self.verbose_name = verbose_name or field_name.replace("_", " ").title() + + def contribute_to_class(self, cls, name): + """Called by Django when the descriptor is added to the model class.""" + # Create manual field + manual_field = self._create_field(verbose_name=f"{self.verbose_name} Manual", db_column=self.manual_field_name) + + # Create ML field + ml_field = self._create_field(verbose_name=f"{self.verbose_name} ML", db_column=self.ml_field_name) + + # Add fields to the model's _meta + cls.add_to_class(self.manual_field_name, manual_field) + cls.add_to_class(self.ml_field_name, ml_field) + + # Store the descriptor + setattr(cls, name, self) + + def _create_field(self, verbose_name, db_column): + """Helper method to create a new field instance with the right configuration""" + if isinstance(self.field_type, type): + # If field_type is a class, instantiate it + field = self.field_type() + else: + # If field_type is already an instance, clone it + field = self.field_type.clone() + + field.verbose_name = verbose_name + field.db_column = db_column + + return field + + def __get__(self, instance, owner): + """ + Get the value of the main field: + - Returns manual tags if they exist + - Otherwise returns ML tags + """ + if instance is None: + return self + + manual_value = getattr(instance, self.manual_field_name, None) + ml_value = getattr(instance, self.ml_field_name, None) + + # Return manual value only if it exists and is not empty + if manual_value and len(manual_value) > 0: + return manual_value + return ml_value + + def __set__(self, instance, value): + """ + Set only the manual field when setting the field. + ML field must be set explicitly. 
+ """ + + setattr(instance, self.manual_field_name, value) + + def __delete__(self, instance): + """Delete both manual and ML fields""" + setattr(instance, self.manual_field_name, None) + setattr(instance, self.ml_field_name, None) diff --git a/sde_collections/utils/slack_utils.py b/sde_collections/utils/slack_utils.py index c4cfd78b..44979e04 100644 --- a/sde_collections/utils/slack_utils.py +++ b/sde_collections/utils/slack_utils.py @@ -4,95 +4,59 @@ from ..models.collection_choice_fields import WorkflowStatusChoices SLACK_ID_MAPPING = { - "Carson Davis": "@UESJLQXH6", - "Bishwas Praveen": "@U05QZUF182J", - "Xiang Li": "@U03PPLNDZA7", - "Shravan Vishwanathan": "@U056B4HMGEP", - "Advait Yogaonkar": "@U06L5SKQ5QA", - "Emily Foshee": "@UPKDARB9P", - "Ashish Acharya": "@UC97PNAF6", - "channel": "!here", + "Shravan Vishwanathan": "<@U056B4HMGEP>", + "Advait Yogaonkar": "<@U06L5SKQ5QA>", + "channel": "", } - STATUS_CHANGE_NOTIFICATIONS = { (WorkflowStatusChoices.RESEARCH_IN_PROGRESS, WorkflowStatusChoices.READY_FOR_ENGINEERING): { "message": "Research on {name} is complete. Ready for engineering! :rocket:", - "tags": [ - SLACK_ID_MAPPING["Xiang Li"], - SLACK_ID_MAPPING["Shravan Vishwanathan"], - SLACK_ID_MAPPING["Advait Yogaonkar"], - ], }, (WorkflowStatusChoices.ENGINEERING_IN_PROGRESS, WorkflowStatusChoices.READY_FOR_CURATION): { "message": "Engineering on {name} is complete. Ready for curation! :mag:", - "tags": [SLACK_ID_MAPPING["Emily Foshee"]], }, (WorkflowStatusChoices.CURATION_IN_PROGRESS, WorkflowStatusChoices.CURATED): { "message": "Curation on {name} is complete. It's now curated! :checkered_flag:", - "tags": [ - SLACK_ID_MAPPING["Carson Davis"], - SLACK_ID_MAPPING["Bishwas Praveen"], - SLACK_ID_MAPPING["Ashish Acharya"], - ], }, (WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED): { "message": "Alert: Secret deployment of {name} has failed! :warning:", - "tags": [ - SLACK_ID_MAPPING["Carson Davis"], - SLACK_ID_MAPPING["Bishwas Praveen"], - SLACK_ID_MAPPING["Ashish Acharya"], - ], }, (WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK): { "message": "Indexing of {name} on Secret Prod completed successfully. Ready for LRM QC! :clipboard:", - "tags": [SLACK_ID_MAPPING["Shravan Vishwanathan"], SLACK_ID_MAPPING["Advait Yogaonkar"]], }, (WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK): { "message": "LRM QC passed for {name}. Ready for final quality check! :white_check_mark:", - "tags": [SLACK_ID_MAPPING["Emily Foshee"]], }, (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.QUALITY_CHECK_FAILED): { "message": "Quality check on {name} has failed. Changes needed! :x:", - "tags": [ - SLACK_ID_MAPPING["Xiang Li"], - SLACK_ID_MAPPING["Shravan Vishwanathan"], - SLACK_ID_MAPPING["Advait Yogaonkar"], - ], + "mention_users": ["Shravan Vishwanathan", "Advait Yogaonkar"], }, (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.QUALITY_CHECK_PERFECT): { "message": "{name} has passed all quality checks and is ready for public production! :white_check_mark:", - "tags": [ - SLACK_ID_MAPPING["Carson Davis"], - SLACK_ID_MAPPING["Bishwas Praveen"], - SLACK_ID_MAPPING["Ashish Acharya"], - ], }, (WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, WorkflowStatusChoices.QUALITY_CHECK_MINOR): { "message": "{name} has passed all quality checks and is ready for public production! 
:white_check_mark:", - "tags": [ - SLACK_ID_MAPPING["Carson Davis"], - SLACK_ID_MAPPING["Bishwas Praveen"], - SLACK_ID_MAPPING["Ashish Acharya"], - ], }, (WorkflowStatusChoices.QUALITY_CHECK_PERFECT, WorkflowStatusChoices.PROD_PERFECT): { "message": "{name} is now live on Public Prod! Congrats team! :sparkles:", - "tags": [SLACK_ID_MAPPING["channel"]], + "mention_users": ["channel"], }, (WorkflowStatusChoices.QUALITY_CHECK_MINOR, WorkflowStatusChoices.PROD_MINOR): { "message": "{name} is now live on Public Prod! Congrats team! :sparkles:", - "tags": [SLACK_ID_MAPPING["channel"]], + "mention_users": ["channel"], }, } def format_slack_message(name, details, collection_id): message_template = details["message"] - tags = " ".join([f"<{user}>" for user in details["tags"]]) - link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" + link = f"https://sde-indexing-helper.nasa-impact.net/{collection_id}/" # noqa: E231 linked_name = f"<{link}|{name}>" - return tags + " " + message_template.format(name=linked_name) + if "mention_users" in details: + slack_mentions = " ".join(SLACK_ID_MAPPING[user] for user in details["mention_users"]) + return slack_mentions + " " + message_template.format(name=linked_name) + return message_template.format(name=linked_name) def send_slack_message(message): @@ -101,5 +65,5 @@ def send_slack_message(message): response = requests.post(webhook_url, json=payload) if response.status_code != 200: raise ValueError( - f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" + f"Request to Slack returned an error {response.status_code}, the response is:\n{response.text}" # noqa: E231, E501 ) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index b9171de3..c39b989a 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -17,7 +17,7 @@ def is_valid_xpath(xpath: str) -> bool: return False -def is_valid_fstring(pattern: str) -> bool: +def validate_fstring(pattern: str) -> bool: context = { "url": "", "title": "", @@ -32,7 +32,7 @@ def is_valid_fstring(pattern: str) -> bool: if node.value.id not in context: variables_allowed = ", ".join([key for key in context.keys()]) raise ValueError( - f"Variable '{node.value.id}' not allowed in f-string pattern." + f"Variable '{node.value.id}' not allowed in f-string pattern." 
# noqa: E713 f" Allowed variables are: {variables_allowed}" ) @@ -53,7 +53,7 @@ def resolve_brace(pattern: str, context: dict[str, Any]) -> str: """Safely interpolates the variables in an f-string pattern using the provided context.""" parsed = ast.parse(f"f'''{pattern}'''", mode="eval") - is_valid_fstring(pattern) # Refactor this + validate_fstring(pattern) compiled = compile(parsed, "", "eval") return str(eval(compiled, {}, context)) @@ -63,29 +63,33 @@ def resolve_xpath(xpath: str, url: str) -> str: if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") - response = requests.get(url) - - if response.ok: - tree = html.fromstring(response.content) - values = tree.xpath(xpath) - - if len(values) == 1: - if isinstance(values[0], str): - text_content = values[0] - else: - text_content = values[0].text - - if text_content: - text_content = clean_text(text_content) - return text_content + try: + response = requests.get(url) + + if response.ok: + tree = html.fromstring(response.content) + values = tree.xpath(xpath) + + if len(values) == 1: + if isinstance(values[0], str): + text_content = values[0] + else: + text_content = values[0].text + + if text_content: + text_content = clean_text(text_content) + return text_content + else: + raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") + elif len(values) > 1: + raise ValueError(f"More than one element found for the xpath, {xpath}") else: - raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") - elif len(values) > 1: - raise ValueError(f"More than one element found for the xpath, {xpath}") + raise ValueError(f"No element found for the xpath, {xpath}") else: - raise ValueError(f"No element found for the xpath, {xpath}") - else: - raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") + raise ValueError(f"Failed to retrieve the {url}. 
Status code: {response.status_code}") + + except requests.RequestException as e: + raise ValueError(f"Network error while accessing {url}: {str(e)}") def parse_title(input_string: str) -> list[tuple[str, str]]: diff --git a/sde_collections/views.py b/sde_collections/views.py index 241979ba..fb268170 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -18,28 +18,33 @@ from rest_framework.views import APIView from .forms import CollectionGithubIssueForm, CommentsForm, RequiredUrlForm -from .models.candidate_url import CandidateURL, ResolvedTitle, ResolvedTitleError from .models.collection import Collection, Comments, RequiredUrls, WorkflowHistory from .models.collection_choice_fields import ( ConnectorChoices, CurationStatusChoices, Divisions, DocumentTypes, + ReindexingStatusChoices, WorkflowStatusChoices, ) -from .models.pattern import ( - DivisionPattern, - DocumentTypePattern, - ExcludePattern, - IncludePattern, - TitlePattern, +from .models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, + DeltaIncludePattern, + DeltaResolvedTitle, + DeltaResolvedTitleError, + DeltaTitlePattern, ) +from .models.delta_url import CuratedUrl, DeltaUrl from .serializers import ( - CandidateURLAPISerializer, - CandidateURLBulkCreateSerializer, - CandidateURLSerializer, CollectionReadSerializer, CollectionSerializer, + CuratedURLAPISerializer, + CuratedURLSerializer, + DeltaURLAPISerializer, + DeltaURLBulkCreateSerializer, + DeltaURLSerializer, DivisionPatternSerializer, DocumentTypePatternSerializer, ExcludePatternSerializer, @@ -66,8 +71,11 @@ def get_queryset(self): super() .get_queryset() .filter(delete=False) - .annotate(num_candidate_urls=models.Count("candidate_urls")) - .order_by("-num_candidate_urls") + .annotate( + num_delta_urls=models.Count("delta_urls", distinct=True), + num_curated_urls=models.Count("curated_urls", distinct=True), + ) + .order_by("-num_delta_urls") ) def get_context_data(self, **kwargs): @@ -76,6 +84,7 @@ def get_context_data(self, **kwargs): context["curators"] = User.objects.filter(groups__name="Curators") context["curation_status_choices"] = CurationStatusChoices context["workflow_status_choices"] = WorkflowStatusChoices + context["reindexing_status_choices"] = ReindexingStatusChoices return context @@ -173,6 +182,7 @@ def get_context_data(self, **kwargs): "-created_at" ) context["workflow_status_choices"] = WorkflowStatusChoices + context["reindexing_status_choices"] = ReindexingStatusChoices return context @@ -184,14 +194,14 @@ def get_success_url(self, *args, **kwargs): return reverse("sde_collections:detail", kwargs={"pk": self.object.collection.pk}) -class CandidateURLsListView(LoginRequiredMixin, ListView): +class DeltaURLsListView(LoginRequiredMixin, ListView): """ Display a list of collections in the system """ - model = CandidateURL - template_name = "sde_collections/candidate_urls_list.html" - context_object_name = "candidate_urls" + model = DeltaUrl + template_name = "sde_collections/delta_urls_list.html" + context_object_name = "delta_urls" # paginate_by = 1000 def _filter_by_is_exluded(self, queryset, is_excluded): @@ -214,13 +224,14 @@ def get_queryset(self): def get_context_data(self, **kwargs): context = super().get_context_data(**kwargs) - context["segment"] = "candidate-url-list" + context["segment"] = "delta-url-list" context["collection"] = self.collection context["regex_exclude_patterns"] = self.collection.excludepattern.filter( match_pattern_type=2 ) # 2=regex patterns 
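        # Illustrative sketch, not part of the change set: the magic number above follows
        # the pattern-type convention used throughout this PR (1 = individual URL pattern,
        # 2 = multi-URL / regex pattern). Assuming the delta pattern models expose the same
        # MatchPatternTypeChoices enum referenced on DeltaDocumentTypePattern, and assuming
        # the multi-URL member is named MULTI_URL_PATTERN (hypothetical name), the filter
        # could be written without the literal 2:
        #
        #     context["regex_exclude_patterns"] = self.collection.excludepattern.filter(
        #         match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN
        #     )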
context["title_patterns"] = self.collection.titlepattern.all() context["workflow_status_choices"] = WorkflowStatusChoices + context["reindexing_status_choices"] = ReindexingStatusChoices context["is_multi_division"] = self.collection.is_multi_division return context @@ -254,9 +265,9 @@ def get_queryset(self): return super().get_queryset().filter(collection=collection) -class CandidateURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = CandidateURL.objects.all() - serializer_class = CandidateURLSerializer +class DeltaURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = DeltaUrl.objects.all() + serializer_class = DeltaURLSerializer def _filter_by_is_excluded(self, queryset, is_excluded): if is_excluded == "false": @@ -268,25 +279,75 @@ def _filter_by_is_excluded(self, queryset, is_excluded): def get_queryset(self): queryset = super().get_queryset() if self.request.method == "GET": + collection_id = self.request.GET.get("collection_id") # Filter based on exclusion status is_excluded = self.request.GET.get("is_excluded") if is_excluded: queryset = self._filter_by_is_excluded(queryset, is_excluded) + + # Annotate queryset with two pieces of information: + # 1. exclude_pattern_type: Type of exclude pattern (1=Individual URL, 2=Multi-URL Pattern) + # Ordered by -match_pattern_type to prioritize multi-url patterns (type 2) + # 2. include_pattern_id: ID of any include pattern affecting this URL + # Used when we need to delete the include pattern during re-exclusion + queryset = queryset.annotate( + exclude_pattern_type=models.Subquery( + DeltaExcludePattern.objects.filter(delta_urls=models.OuterRef("pk"), collection_id=collection_id) + .order_by("-match_pattern_type") + .values("match_pattern_type")[:1] + ), + include_pattern_id=models.Subquery( + DeltaIncludePattern.objects.filter( + delta_urls=models.OuterRef("pk"), collection_id=collection_id + ).values("id")[:1] + ), + ) + return queryset.order_by("url") def update_division(self, request, pk=None): - candidate_url = get_object_or_404(CandidateURL, pk=pk) + delta_url = get_object_or_404(DeltaUrl, pk=pk) division = request.data.get("division") if division: - candidate_url.division = division - candidate_url.save() + delta_url.division = division + delta_url.save() return Response(status=status.HTTP_200_OK) return Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) -class CandidateURLBulkCreateView(generics.ListCreateAPIView): - queryset = CandidateURL.objects.all() - serializer_class = CandidateURLBulkCreateSerializer +class CuratedURLViewSet(CollectionFilterMixin, viewsets.ModelViewSet): + queryset = CuratedUrl.objects.all() + serializer_class = CuratedURLSerializer + + def _filter_by_is_excluded(self, queryset, is_excluded): + if is_excluded == "false": + queryset = queryset.filter(excluded=False) + elif is_excluded == "true": + queryset = queryset.exclude(excluded=False) + return queryset + + def get_queryset(self): + queryset = super().get_queryset() + if self.request.method == "GET": + # Filter based on exclusion status + is_excluded = self.request.GET.get("is_excluded") + if is_excluded: + queryset = self._filter_by_is_excluded(queryset, is_excluded) + return queryset.order_by("url") + + def update_division(self, request, pk=None): + delta_url = get_object_or_404(CuratedUrl, pk=pk) + division = request.data.get("division") + if division: + delta_url.division = division + delta_url.save() + return Response(status=status.HTTP_200_OK) + return 
Response(status=status.HTTP_400_BAD_REQUEST, data={"error": "Division is required."}) + + +class DeltaURLBulkCreateView(generics.ListCreateAPIView): + queryset = DeltaUrl.objects.all() + serializer_class = DeltaURLBulkCreateSerializer def perform_create(self, serializer, collection_id=None): for validated_data in serializer.validated_data: @@ -296,7 +357,7 @@ def perform_create(self, serializer, collection_id=None): def create(self, request, *args, **kwargs): config_folder = kwargs.get("config_folder") collection = Collection.objects.get(config_folder=config_folder) - collection.candidate_urls.all().delete() + collection.delta_urls.all().delete() serializer = self.get_serializer(data=request.data, many=True) serializer.is_valid(raise_exception=True) @@ -307,8 +368,25 @@ def create(self, request, *args, **kwargs): return Response(serializer.data, status=status.HTTP_201_CREATED) -class CandidateURLAPIView(ListAPIView): - serializer_class = CandidateURLAPISerializer +class DeltaURLAPIView(ListAPIView): + serializer_class = DeltaURLAPISerializer + + def get(self, request, *args, **kwargs): + config_folder = kwargs.get("config_folder") + self.config_folder = config_folder + return super().get(request, *args, **kwargs) + + def get_queryset(self): + queryset = ( + DeltaUrl.objects.filter(collection__config_folder=self.config_folder) + .with_exclusion_status() + .filter(excluded=False) + ) + return queryset + + +class CuratedURLAPIView(ListAPIView): + serializer_class = CuratedURLAPISerializer def get(self, request, *args, **kwargs): config_folder = kwargs.get("config_folder") @@ -317,7 +395,7 @@ def get(self, request, *args, **kwargs): def get_queryset(self): queryset = ( - CandidateURL.objects.filter(collection__config_folder=self.config_folder) + CuratedUrl.objects.filter(collection__config_folder=self.config_folder) .with_exclusion_status() .filter(excluded=False) ) @@ -325,7 +403,7 @@ def get_queryset(self): class ExcludePatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = ExcludePattern.objects.all() + queryset = DeltaExcludePattern.objects.all() serializer_class = ExcludePatternSerializer def get_queryset(self): @@ -335,17 +413,17 @@ def create(self, request, *args, **kwargs): match_pattern = request.POST.get("match_pattern") collection_id = request.POST.get("collection") try: - ExcludePattern.objects.get( + DeltaExcludePattern.objects.get( collection_id=Collection.objects.get(id=collection_id), match_pattern=match_pattern, ).delete() return Response(status=status.HTTP_200_OK) - except ExcludePattern.DoesNotExist: + except DeltaExcludePattern.DoesNotExist: return super().create(request, *args, **kwargs) class IncludePatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = IncludePattern.objects.all() + queryset = DeltaIncludePattern.objects.all() serializer_class = IncludePatternSerializer def get_queryset(self): @@ -355,17 +433,17 @@ def create(self, request, *args, **kwargs): match_pattern = request.POST.get("match_pattern") collection_id = request.POST.get("collection") try: - IncludePattern.objects.get( + DeltaIncludePattern.objects.get( collection_id=Collection.objects.get(id=collection_id), match_pattern=match_pattern, ).delete() return Response(status=status.HTTP_200_OK) - except IncludePattern.DoesNotExist: + except DeltaIncludePattern.DoesNotExist: return super().create(request, *args, **kwargs) class TitlePatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = TitlePattern.objects.all() + queryset = 
DeltaTitlePattern.objects.all() serializer_class = TitlePatternSerializer def get_queryset(self): @@ -373,7 +451,7 @@ def get_queryset(self): class DocumentTypePatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = DocumentTypePattern.objects.all() + queryset = DeltaDocumentTypePattern.objects.all() serializer_class = DocumentTypePatternSerializer def get_queryset(self): @@ -387,18 +465,18 @@ def create(self, request, *args, **kwargs): collection_id = request.POST.get("collection") match_pattern = request.POST.get("match_pattern") try: - DocumentTypePattern.objects.get( + DeltaDocumentTypePattern.objects.get( collection_id=Collection.objects.get(id=collection_id), match_pattern=match_pattern, - match_pattern_type=DocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL, ).delete() return Response(status=status.HTTP_200_OK) - except DocumentTypePattern.DoesNotExist: + except DeltaDocumentTypePattern.DoesNotExist: return Response(status=status.HTTP_204_NO_CONTENT) class DivisionPatternViewSet(CollectionFilterMixin, viewsets.ModelViewSet): - queryset = DivisionPattern.objects.all() + queryset = DeltaDivisionPattern.objects.all() serializer_class = DivisionPatternSerializer def get_queryset(self): @@ -536,19 +614,19 @@ def get_context_data(self, **kwargs): class ResolvedTitleListView(ListView): - model = ResolvedTitle + model = DeltaResolvedTitle context_object_name = "resolved_titles" class ResolvedTitleErrorListView(ListView): - model = ResolvedTitleError + model = DeltaResolvedTitleError context_object_name = "resolved_title_errors" class TitlesAndErrorsView(View): def get(self, request, *args, **kwargs): - resolved_titles = ResolvedTitle.objects.select_related("title_pattern", "candidate_url").all() - resolved_title_errors = ResolvedTitleError.objects.select_related("title_pattern", "candidate_url").all() + resolved_titles = DeltaResolvedTitle.objects.select_related("title_pattern", "delta_url").all() + resolved_title_errors = DeltaResolvedTitleError.objects.select_related("title_pattern", "delta_url").all() context = { "resolved_titles": resolved_titles, "resolved_title_errors": resolved_title_errors, diff --git a/sde_collections/xml_templates/new_collection_template.xml b/sde_collections/xml_templates/new_collection_template.xml index 8c80f33b..0ee71927 100644 --- a/sde_collections/xml_templates/new_collection_template.xml +++ b/sde_collections/xml_templates/new_collection_template.xml @@ -145,7 +145,7 @@ false - 8 + 3 diff --git a/sde_indexing_helper/static/css/collections_list.css b/sde_indexing_helper/static/css/collections_list.css index caed7a4a..78796f98 100644 --- a/sde_indexing_helper/static/css/collections_list.css +++ b/sde_indexing_helper/static/css/collections_list.css @@ -313,3 +313,50 @@ margin-bottom: 0 !important; .search-container input:focus { font-style: italic; } + + .pattern-dropdown { + width: 100%; + background: #A7BACD !important; + font-size: 15px; + font-weight: 500; + line-height: 17.58px; + color: #1F2935; + display: flex; + justify-content: space-between; + align-items: center; + text-transform: capitalize; + border-radius: 4px; + margin-bottom: 0; + } + + .pattern-dropdown-input { + flex-direction: column; + width: 100%; + } + + .pattern-type-form { + width: 100%; + background: #15232E; + color: white; + border: 1px solid white; + padding: 24px 15px; + border-radius: 4px; + } + + .pattern-form-group { + margin-top: 40px; + } + + .form-label { + color: 
white; + display: flex; + font-size: 12px; + font-weight: 500; + letter-spacing: -0.02em; + margin-bottom: 8px; + } + + .form-label .asterik { + color: #C3001A; + margin-left: 4px; + } diff --git a/sde_indexing_helper/static/css/candidate_url_list.css b/sde_indexing_helper/static/css/delta_url_list.css similarity index 71% rename from sde_indexing_helper/static/css/candidate_url_list.css rename to sde_indexing_helper/static/css/delta_url_list.css index aa2d5d18..06689207 100644 --- a/sde_indexing_helper/static/css/candidate_url_list.css +++ b/sde_indexing_helper/static/css/delta_url_list.css @@ -18,7 +18,8 @@ text-decoration-thickness: 1px; } -.dataTables_scrollHead, .dataTables_scrollBody { +.dataTables_scrollHead, +.dataTables_scrollBody { overflow: visible !important; } @@ -43,7 +44,7 @@ background: #FFF; color: white; border-radius: 5px; - background-color:#15232E; + background-color: #15232E; } .custom-menu li { @@ -76,29 +77,31 @@ cursor: pointer } -.table_filter_row_input{ +.table_filter_row_input { width: 100%; } - .select-dropdown { +.select-dropdown { text-align: center; width: 100% !important; - color: #333333;; + color: #333333; + ; background-color: #fafafa; border-radius: 0.2rem; - border-color: #fafafa; + border-color: #fafafa; font-size: 0.6875rem; - box-shadow: 0 2px 2px 0 rgba(153, 153, 153, 0.14), 0 3px 1px -2px rgba(153, 153, 153, 0.2), 0 1px 5px 0 rgba(153, 153, 153, 0.12); } + box-shadow: 0 2px 2px 0 rgba(153, 153, 153, 0.14), 0 3px 1px -2px rgba(153, 153, 153, 0.2), 0 1px 5px 0 rgba(153, 153, 153, 0.12); +} - .select-dropdown:hover { +.select-dropdown:hover { box-shadow: 0 14px 26px -12px rgba(250, 250, 250, 0.42), 0 4px 23px 0px rgba(0, 0, 0, 0.12), 0 8px 10px -5px rgba(250, 250, 250, 0.2); - } +} - .select-dropdown:focus, - .select-dropdown.focus { +.select-dropdown:focus, +.select-dropdown.focus { box-shadow: none, 0 0 0 0.2rem rgba(76, 175, 80, 0.5); - } +} /* badge showing workflow status by header */ .badge { @@ -107,7 +110,8 @@ } -.table_filter_row_input, .doc-dropdown{ +.table_filter_row_input, +.doc-dropdown { width: 100%; } @@ -159,16 +163,17 @@ border-radius: 1px !important; } -.candidateUrlContainer { +.deltaUrlContainer { background: #15232E; padding: 40px 30px; border-radius: 15px; } + .modalTitle { -font-size: 24px; -font-weight: 600; -line-height: 36px; -letter-spacing: -0.03em; + font-size: 24px; + font-weight: 600; + line-height: 36px; + letter-spacing: -0.03em; } #hideShowColumnsModal { @@ -181,43 +186,45 @@ letter-spacing: -0.03em; z-index: 2000; } -#caption, #subTitle { -font-size: 14px; -font-weight: 400; -line-height: 21px; -letter-spacing: -0.02em; +#caption, +#subTitle { + font-size: 14px; + font-weight: 400; + line-height: 21px; + letter-spacing: -0.02em; } - .checkbox-wrapper { +.checkbox-wrapper { display: flex; align-items: baseline; - } +} - .checkbox-wrapper label { +.checkbox-wrapper label { font-weight: 600; font-size: 16px; line-height: 24px; margin-bottom: 0; color: rgba(31, 41, 53, 1); padding-left: 10px; - } +} - .modalFooter { +.modalFooter { position: sticky; bottom: 0; position: sticky; bottom: 0; padding: 10px 0; background: #FFFFFF; - } -.badge{ +} + +.badge { background-color: #FF3D57; } -.notifyBadge{ - margin-left:5px !important; +.notifyBadge { + margin-left: 5px !important; } .sorting_1 { @@ -227,102 +234,113 @@ letter-spacing: -0.02em; max-width: 600px; width: 600px; color: #65B1EF; - } +} .title-dropdown { width: fit-content !important; - margin-top:20px; - margin-bottom:20px; + margin-top: 20px; + margin-bottom: 20px; 
} + .table tbody tr:nth-child(odd) { background-color: #050E19 !important; - } +} - .table tbody tr:nth-child(even) { +.table tbody tr:nth-child(even) { background-color: #3F4A58 !important; - } - .candidateTitle{ - font-size:24px; +} + +.deltaTitle { + font-size: 24px; font-weight: 500; - } +} - .custom-select, .buttons-csv, .customizeColumns, .addPattern{ +.custom-select, +.buttons-csv, +.customizeColumns, +.addPattern { border-style: solid !important; border-color: #A7BACD !important; border-width: 1px !important; - color:#A7BACD !important; + color: #A7BACD !important; border-radius: 5px !important; padding: 11px 15px; - } +} - .addPattern { +.addPattern { background-color: #0066CA !important; border-color: #0066CA !important; color: #fff !important; - } +} - #exclude_patterns_table_wrapper .dt-buttons, #include_patterns_table_wrapper .dt-buttons, #document_type_patterns_table_wrapper .dt-buttons, #title_patterns_table_wrapper .dt-buttons { +#exclude_patterns_table_wrapper .dt-buttons, +#include_patterns_table_wrapper .dt-buttons, +#document_type_patterns_table_wrapper .dt-buttons, +#title_patterns_table_wrapper .dt-buttons { width: 89%; justify-content: end; - } +} - .customizeColumns { +.customizeColumns { margin-left: 10px !important; - } +} - .form-control:read-only { +.form-control:read-only { background-image: none; - } +} - .dt-container div.dt-length label { +.dt-container div.dt-length label { display: none; - } +} - div.dt-container div.dt-info { +div.dt-container div.dt-info { padding-top: 0; white-space: normal; } -.page-link{ - color:white !important; - border:0.5px solid !important; - margin-left:3px; - margin-right:3px; +.page-link { + color: white !important; + border: 0.5px solid !important; + margin-left: 3px; + margin-right: 3px; } -.page-link:hover{ + +.page-link:hover { background-color: #0066CA !important; } .page-item.disabled .page-link { - color:grey!important; + color: grey !important; } -.dt-paging-input{ - color:white; + +.dt-paging-input { + color: white; } -.dt-paging-input input{ +.dt-paging-input input { background-color: #3F4A58; color: white; - border:solid 0.5px !important; + border: solid 0.5px !important; } -.dt-inputpaging{ - position: absolute; - right: 16px; - top: -27px; +.dt-inputpaging { + position: absolute; + right: 16px; + top: -27px; } -.ml-auto{ - width:50%; + +.ml-auto { + width: 50%; } -.custom-select-sm{ - margin-left:5px; +.custom-select-sm { + margin-left: 5px; } -.selected{ +.selected { background-color: inherit !important; } @@ -334,26 +352,28 @@ div.dt-buttons .btn.processing:after { -webkit-animation: dtb-spinner 1500ms infinite linear; } -.document_type_dropdown, .division_dropdown, .dropdown-toggle { +.document_type_dropdown, +.division_dropdown, +.dropdown-toggle { width: 100%; display: flex; justify-content: center; } - .dropdown-toggle { +.dropdown-toggle { width: 80%; /* display: flex; */ align-items: center; /* justify-content: space-between; */ - } +} -.headerDiv{ +.headerDiv { display: flex; justify-content: space-between; } .url-cell { - display:flex; + display: flex; align-items: center; justify-content: space-between; word-wrap: break-word; @@ -362,17 +382,19 @@ div.dt-buttons .btn.processing:after { overflow-wrap: break-word; min-width: 100%; max-width: 100%; - } +} - .url-icon { +.url-icon { color: #65B1EF; - } -#match_pattern_input, #title_pattern_input { +} + +#match_pattern_input, +#title_pattern_input { background: #3F4A58; border-radius: 4px; } -.modal-body .bmd-label-static { +.modal-body .bmd-label-static { 
top: -20px !important; } @@ -396,25 +418,26 @@ div.dt-buttons .btn.processing:after { margin-top: 40px; } -.is-focused [class^='bmd-label']{ - color:#0066CA; - } - .form-control{ - color:white; - } +.is-focused [class^='bmd-label'] { + color: #0066CA; +} + +.form-control { + color: white; +} - .form-control:focus{ - color:white; - } +.form-control:focus { + color: white; +} - .is-focused .form-label{ - background-image:linear-gradient(to top, #0066CA 2px, rgba(156, 39, 176, 0) 2px), linear-gradient(to top, #d2d2d2 1px, rgba(210, 210, 210, 0) 1px); - color:#AAAAAA; - } +.is-focused .form-label { + background-image: linear-gradient(to top, #0066CA 2px, rgba(156, 39, 176, 0) 2px), linear-gradient(to top, #d2d2d2 1px, rgba(210, 210, 210, 0) 1px); + color: #AAAAAA; +} - .dropdown-item:hover{ +.dropdown-item:hover { background-color: #0066CA !important; - } +} /* pagination position */ diff --git a/sde_indexing_helper/static/js/collection_list.js b/sde_indexing_helper/static/js/collection_list.js index dc33ce3b..78fd4894 100644 --- a/sde_indexing_helper/static/js/collection_list.js +++ b/sde_indexing_helper/static/js/collection_list.js @@ -1,3 +1,19 @@ +// Define column constants for better maintainability +const COLUMNS = { + NAME: 0, + URL: 1, + DIVISION: 2, + DELTA_URLS: 3, + CURATED_URLS: 4, + WORKFLOW_STATUS: 5, + CURATOR: 6, + CONNECTOR_TYPE: 7, + REINDEXING_STATUS: 8, + WORKFLOW_STATUS_RAW: 9, + CURATOR_ID: 10, + REINDEXING_STATUS_RAW: 11 +}; + var uniqueId; //used for logic related to contents on column customization modal function modalContents(tableName) { @@ -107,147 +123,184 @@ let table = $("#collection_table").DataTable({ }, ], columnDefs: [ + // hide the data columns and search panes for these columns { - targets: 8, + targets: [COLUMNS.WORKFLOW_STATUS_RAW, COLUMNS.CURATOR_ID, COLUMNS.REINDEXING_STATUS_RAW], visible: false, + searchPanes: { + show: false, + }, }, - { width: "200px", targets: 1 }, + { width: "200px", targets: COLUMNS.URL }, { searchPanes: { options: [ { label: "0 URLs", value: function (rowData, rowIdx) { - return $(rowData[3]).text() == 0; + return $(rowData[COLUMNS.DELTA_URLS]).text() == 0; }, }, { label: "1 solo URL", value: function (rowData, rowIdx) { - return $(rowData[3]).text() == 1; + return $(rowData[COLUMNS.DELTA_URLS]).text() == 1; }, }, { label: "1 to 100 URLs", value: function (rowData, rowIdx) { - return $(rowData[3]).text() <= 100 && $(rowData[3]).text() > 1; + return $(rowData[COLUMNS.DELTA_URLS]).text() <= 100 && $(rowData[COLUMNS.DELTA_URLS]).text() > 1; }, }, { label: "100 to 1,000 URLs", value: function (rowData, rowIdx) { - return $(rowData[3]).text() <= 1000 && $(rowData[3]).text() > 100; + return $(rowData[COLUMNS.DELTA_URLS]).text() <= 1000 && $(rowData[COLUMNS.DELTA_URLS]).text() > 100; }, }, { label: "1,000 to 10,000 URLs", value: function (rowData, rowIdx) { - return ( - $(rowData[3]).text() <= 10000 && $(rowData[3]).text() > 1000 - ); + return $(rowData[COLUMNS.DELTA_URLS]).text() <= 10000 && $(rowData[COLUMNS.DELTA_URLS]).text() > 1000; }, }, { label: "10,000 to 100,000 URLs", value: function (rowData, rowIdx) { - return ( - $(rowData[3]).text() <= 100000 && $(rowData[3]).text() > 10000 - ); + return $(rowData[COLUMNS.DELTA_URLS]).text() <= 100000 && $(rowData[COLUMNS.DELTA_URLS]).text() > 10000; }, }, { label: "Over 100,000 URLs", value: function (rowData, rowIdx) { - return $(rowData[3]).text() > 100000; + return $(rowData[COLUMNS.DELTA_URLS]).text() > 100000; }, }, ], + show: true, }, - targets: [3], + targets: [COLUMNS.DELTA_URLS], 
type: "num-fmt", }, { searchPanes: { - show: false, + options: [ + { + label: "0 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() == 0; + }, + }, + { + label: "1 solo URL", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() == 1; + }, + }, + { + label: "1 to 100 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() <= 100 && $(rowData[COLUMNS.CURATED_URLS]).text() > 1; + }, + }, + { + label: "100 to 1,000 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() <= 1000 && $(rowData[COLUMNS.CURATED_URLS]).text() > 100; + }, + }, + { + label: "1,000 to 10,000 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() <= 10000 && $(rowData[COLUMNS.CURATED_URLS]).text() > 1000; + }, + }, + { + label: "10,000 to 100,000 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() <= 100000 && $(rowData[COLUMNS.CURATED_URLS]).text() > 10000; + }, + }, + { + label: "Over 100,000 URLs", + value: function (rowData, rowIdx) { + return $(rowData[COLUMNS.CURATED_URLS]).text() > 100000; + }, + }, + ], + show: true, }, - targets: [7, 8], + targets: [COLUMNS.CURATED_URLS], + type: "num-fmt", }, { searchPanes: { + show: true, dtOpts: { scrollY: "100%", }, }, - targets: [5], + targets: [COLUMNS.CURATOR], + }, + { + searchPanes: { + show: true, + dtOpts: { + scrollY: "100%", + }, + }, + targets: [COLUMNS.CONNECTOR_TYPE], + }, + // Explicitly enable required searchPanes – Selenium requires searchPanes to be explicitly enabled for proper functionality during testing. + { + searchPanes: { + show: true, + }, + targets: [COLUMNS.DIVISION, COLUMNS.DELTA_URLS, COLUMNS.CURATED_URLS, COLUMNS.WORKFLOW_STATUS, COLUMNS.CURATOR, COLUMNS.CONNECTOR_TYPE, COLUMNS.REINDEXING_STATUS ], }, ], }); $("#collection-dropdown-4").on("change", function () { table - .columns(7) + .columns(COLUMNS.WORKFLOW_STATUS_RAW) .search(this.value ? "^" + this.value + "$" : "", true, false) .draw(); }); $("#collection-dropdown-5").on("change", function () { table - .columns(8) + .columns(COLUMNS.CURATOR_ID) + .search(this.value ? "^" + this.value + "$" : "", true, false) + .draw(); +}); + +$("#collection-dropdown-6").on("change", function () { + table + .columns(COLUMNS.REINDEXING_STATUS_RAW) .search(this.value ? 
"^" + this.value + "$" : "", true, false) .draw(); }); $("#nameFilter").on("keyup", function () { - table.columns(0).search(this.value).draw(); + table.columns(COLUMNS.NAME).search(this.value).draw(); }); $("#urlFilter").on("keyup", function () { - table.columns(1).search(this.value).draw(); + table.columns(COLUMNS.URL).search(this.value).draw(); }); $("#divisionFilter").on("keyup", function () { - table.columns(2).search(this.value).draw(); + table.columns(COLUMNS.DIVISION).search(this.value).draw(); }); $("#connectorTypeFilter").on("keyup", function () { - table.columns(6).search(this.value).draw(); + table.columns(COLUMNS.CONNECTOR_TYPE).search(this.value).draw(); }); var csrftoken = $('input[name="csrfmiddlewaretoken"]').val(); -// I don't think this function is being used -// function handleCurationStatusSelect() { -// $("body").on("click", ".curation_status_select", function () { -// var collection_id = $(this).data('collection-id'); -// var curation_status = $(this).attr('value'); -// var curation_status_text = $(this).text(); -// var color_choices = { -// 1: "btn-light", -// 2: "btn-danger", -// 3: "btn-warning", -// 4: "btn-info", -// 5: "btn-success", -// 6: "btn-primary", -// 7: "btn-info", -// 8: "btn-secondary", -// } - -// $possible_buttons = $('body').find(`[id="curation-status-button-${collection_id}"]`); -// if ($possible_buttons.length > 1) { -// $button = $possible_buttons[1]; -// $button = $($button); -// } else { -// $button = $(`#curation-status-button-${collection_id}`); -// } -// $button.text(curation_status_text); -// $button.removeClass('btn-light btn-danger btn-warning btn-info btn-success btn-primary btn-secondary'); -// $button.addClass(color_choices[parseInt(curation_status)]); -// $('#collection_table').DataTable().searchPanes.rebuildPane(6); -// var collection_division = $(this).data('collection-division'); -// postCurationStatus(collection_id, curation_status, collection_division); -// }); -// } - function handleWorkflowStatusSelect() { $("body").on("click", ".workflow_status_select", function () { var collection_id = $(this).data("collection-id"); @@ -288,21 +341,67 @@ function handleWorkflowStatusSelect() { $button.addClass(color_choices[parseInt(workflow_status)]); var row = table.row("#" + collection_id); let index = row.index(); - var $html = $("
", { html: table.data()[index][4] }); - $html.find("button").html(workflow_status_text); + var $html = $("
", { html: table.data()[index][COLUMNS.WORKFLOW_STATUS] }); + $html.find("button").text(workflow_status_text); $html .find("button") .removeClass( "btn-light btn-danger btn-warning btn-info btn-success btn-primary btn-secondary" ); $html.find("button").addClass(color_choices[parseInt(workflow_status)]); - table.data()[index][4] = $html.html(); - $("#collection_table").DataTable().searchPanes.rebuildPane(4); + table.data()[index][COLUMNS.WORKFLOW_STATUS] = $html.html(); + $("#collection_table").DataTable().searchPanes.rebuildPane(COLUMNS.WORKFLOW_STATUS); postWorkflowStatus(collection_id, workflow_status); }); } +function handleReindexingStatusSelect() { + $("body").on("click", ".reindexing_status_select", function () { + var collection_id = $(this).data("collection-id"); + var reindexing_status = $(this).attr("value"); + var reindexing_status_text = $(this).text(); + var color_choices = { + 1: "btn-light", // REINDEXING_NOT_NEEDED + 2: "btn-warning", // REINDEXING_NEEDED_ON_DEV + 3: "btn-secondary", // REINDEXING_FINISHED_ON_DEV + 4: "btn-info", // REINDEXING_READY_FOR_CURATION + 5: "btn-warning", // REINDEXING_CURATION_IN_PROGRESS + 6: "btn-primary", // REINDEXING_CURATED + 7: "btn-success" // REINDEXING_INDEXED_ON_PROD + }; + + $possible_buttons = $("body").find( + `[id="reindexing-status-button-${collection_id}"]` + ); + if ($possible_buttons.length > 1) { + $button = $possible_buttons[1]; + $button = $($button); + } else { + $button = $(`#reindexing-status-button-${collection_id}`); + } + $button.text(reindexing_status_text); + $button.removeClass( + "btn-light btn-danger btn-warning btn-info btn-success btn-primary btn-secondary" + ); + $button.addClass(color_choices[parseInt(reindexing_status)]); + var row = table.row("#" + collection_id); + let index = row.index(); + var $html = $("
", { html: table.data()[index][COLUMNS.REINDEXING_STATUS] }); + $html.find("button").text(reindexing_status_text); + $html + .find("button") + .removeClass( + "btn-light btn-danger btn-warning btn-info btn-success btn-primary btn-secondary" + ); + $html.find("button").addClass(color_choices[parseInt(reindexing_status)]); + table.data()[index][COLUMNS.REINDEXING_STATUS] = $html.html(); + $("#collection_table").DataTable().searchPanes.rebuildPane(COLUMNS.REINDEXING_STATUS); + + postReindexingStatus(collection_id, reindexing_status); + }); +} + function handleCuratorSelect() { $("body").on("click", ".curator_select", function () { var collection_id = $(this).data("collection-id"); @@ -326,32 +425,33 @@ function handleCuratorSelect() { $button.addClass("btn-success"); var row = table.row("#" + collection_id); let index = row.index(); - var $html = $("
", { html: table.data()[index][5] }); - $html.find("button").html(curator_text); - table.data()[index][5] = $html.html(); - table.searchPanes.rebuildPane(5); + var $html = $("
", { html: table.data()[index][COLUMNS.CURATOR] }); + $html.find("button").text(curator_text); + table.data()[index][COLUMNS.CURATOR] = $html.html(); + table.searchPanes.rebuildPane(COLUMNS.CURATOR); postCurator(collection_id, curator_id); }); } -function postCurationStatus(collection_id, curation_status) { +function postReindexingStatus(collection_id, reindexing_status) { var url = `/api/collections/${collection_id}/`; $.ajax({ url: url, type: "PUT", data: { - curation_status: curation_status, + reindexing_status: reindexing_status, csrfmiddlewaretoken: csrftoken, }, headers: { "X-CSRFToken": csrftoken, }, success: function (data) { - toastr.success("Curation Status Updated!"); + toastr.success("Reindexing Status Updated!"); }, }); } + function postWorkflowStatus(collection_id, workflow_status) { var url = `/api/collections/${collection_id}/`; $.ajax({ @@ -399,10 +499,12 @@ $(document).ready(function () { null, null, "Division", - "Candidate URLs", + "Delta URLs", + "Curated URLs", "Workflow Status", "Curator", "Connector Type", + "Reindexing Status", ]; // Event listener for the collection search input @@ -413,18 +515,19 @@ $(document).ready(function () { // Clear previous search table.search('').columns().search(''); + // TODO: this section might still need to be refactored to align with our column index definitions // Filter the table based on the query in the collection name and config folder data attribute table.rows().every(function () { - let row = $(this.node()); - let name = row.find('td').first().text().toLowerCase(); - let configFolder = row.data('config-folder').toLowerCase(); - let url = row.find('td').eq(1).text().toLowerCase(); - - if (name.includes(query) || configFolder.includes(query) || url.includes(query)) { - row.show(); - } else { - row.hide(); - } + let row = $(this.node()); + let name = row.find('td').first().text().toLowerCase(); + let configFolder = row.data('config-folder').toLowerCase(); + let url = row.find('td').eq(1).text().toLowerCase(); + + if (name.includes(query) || configFolder.includes(query) || url.includes(query)) { + row.show(); + } else { + row.hide(); + } }); }); @@ -446,8 +549,8 @@ $(document).ready(function () { }); function setupClickHandlers() { - // handleCurationStatusSelect(); handleWorkflowStatusSelect(); + handleReindexingStatusSelect(); handleCuratorSelect(); } diff --git a/sde_indexing_helper/static/js/candidate_url_list.js b/sde_indexing_helper/static/js/delta_url_list.js similarity index 61% rename from sde_indexing_helper/static/js/candidate_url_list.js rename to sde_indexing_helper/static/js/delta_url_list.js index ed6d3e4b..33e7850d 100644 --- a/sde_indexing_helper/static/js/candidate_url_list.js +++ b/sde_indexing_helper/static/js/delta_url_list.js @@ -86,7 +86,7 @@ function modalContents(tableName) { .attr("for", "checkbox_" + columnName.replace(/\s+/g, "_")) .text(columnName); var $caption = $("

") - .text(candidateTableHeaderDefinitons[columnName]) + .text(deltaTableHeaderDefinitons[columnName]) .attr({ id: "caption", }); @@ -107,7 +107,7 @@ function initializeDataTable() { var true_icon = 'check'; var false_icon = 'close'; - var candidate_urls_table = $("#candidate_urls_table").DataTable({ + var delta_urls_table = $("#delta_urls_table").DataTable({ pageLength: 100, colReorder: true, stateSave: true, @@ -145,13 +145,13 @@ function initializeDataTable() { lines[0] = reorderedHeaders.join(","); const appliedFilt = [ - [`URL:`, `${$("#candidateUrlFilter").val()}`.trim()], + [`URL:`, `${$("#deltaUrlFilter").val()}`.trim()], [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], [ `Scraped Title:`, - `${$("#candidateScrapedTitleFilter").val()}`.trim(), + `${$("#deltaScrapedTitleFilter").val()}`.trim(), ], - [`New Title:`, `${$("#candidateNewTitleFilter").val()}`.trim()], + [`New Title:`, `${$("#deltaNewTitleFilter").val()}`.trim()], [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], ]; @@ -175,7 +175,7 @@ function initializeDataTable() { else { // Add filter information to the first row const secondRowFilters = [ - "Export of SDE Candidate URLs", + "Export of SDE Delta URLs", `"(Applied Filters: ${appliedFilt .reduce((acc, curr) => { if ( @@ -201,7 +201,7 @@ function initializeDataTable() { text: "Customize Columns", className: "customizeColumns", action: function () { - modalContents("#candidate_urls_table"); + modalContents("#delta_urls_table"); }, }, ], @@ -214,7 +214,7 @@ function initializeDataTable() { stateLoadCallback: function (settings) { var state = JSON.parse( localStorage.getItem( - "DataTables_candidate_urls_" + window.location.pathname + "DataTables_delta_urls_" + window.location.pathname ) ); if (!state) { @@ -223,13 +223,13 @@ function initializeDataTable() { return state; }, ajax: { - url: `/api/candidate-urls/?format=datatables&collection_id=${collection_id}`, + url: `/api/delta-urls/?format=datatables&collection_id=${collection_id}`, data: function (d) { d.is_excluded = $("#filter-checkbox").is(":checked") ? 
false : null; }, }, initComplete: function (data) { - const addDropdownSelect = [1, 4, 5]; + const addDropdownSelect = [1, 2, 4, 5]; const dict = { 1: "Images", 2: "Data", @@ -253,14 +253,17 @@ function initializeDataTable() { columns: [ getURLColumn(), getExcludedColumn(true_icon, false_icon), + getDeletedColumn(true_icon, false_icon), getScrapedTitleColumn(), getGeneratedTitleColumn(), getDocumentTypeColumn(), getDivisionColumn(), { data: "id", visible: false, searchable: false }, + { data: "exclude_pattern_type", visible: false, searchable: false }, + { data: "include_pattern_id", visible: false, searchable: false }, { data: "generated_title_id", visible: false, searchable: false }, { data: "match_pattern_type", visible: false, searchable: false }, - { data: "candidate_urls_count", visible: false, searchable: false }, + { data: "delta_urls_count", visible: false, searchable: false }, { data: "excluded", visible: false, searchable: false }, { data: null, @@ -301,29 +304,245 @@ function initializeDataTable() { }, }); - $("#candidateUrlFilter").on( + $("#deltaUrlFilter").on( "beforeinput", DataTable.util.debounce(function (val) { - candidate_urls_table.columns(0).search(this.value).draw(); + delta_urls_table.columns(0).search(this.value).draw(); }, 1000) ); - $("#candidateScrapedTitleFilter").on( + $("#deltaScrapedTitleFilter").on( "beforeinput", DataTable.util.debounce(function (val) { - candidate_urls_table.columns(2).search(this.value).draw(); + delta_urls_table.columns(2).search(this.value).draw(); }, 1000) ); - $("#candidateNewTitleFilter").on( + $("#deltaNewTitleFilter").on( "beforeinput", DataTable.util.debounce(function (val) { - candidate_urls_table.columns(3).search(this.value).draw(); + delta_urls_table.columns(3).search(this.value).draw(); + }, 1000) + ); + + var curated_urls_table = $("#curated_urls_table").DataTable({ + pageLength: 100, + colReorder: true, + stateSave: true, + layout: { + bottomEnd: "inputPaging", + topEnd: null, + topStart: { + info: true, + pageLength: { + menu: [ + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], + ], + }, + buttons: [ + { + extend: "csv", + exportOptions: { + columns: [0, 11, 2, 12, 10], + }, + customize: function (csv) { + var lines = csv.split("\n"); + + // Reorder the header columns + var headers = lines[0].split(","); + headers[4] = "New Title"; + var reorderedHeaders = [ + headers[0], + headers[3], + headers[1], + headers[4], + headers[5], + headers[2], + ]; + lines[0] = reorderedHeaders.join(","); + + const appliedFilt = [ + [`URL:`, `${$("#curatedUrlFilter").val()}`.trim()], + [`Exclude:`, `${$(".dropdown-1").val()}`.trim()], + [ + `Scraped Title:`, + `${$("#curatedScrapedTitleFilter").val()}`.trim(), + ], + [`New Title:`, `${$("#curatedNewTitleFilter").val()}`.trim()], + [`Document Type:`, `${dict[$(".dropdown-4").val()]}`.trim()], + [`Division By URL:`, `${dict[$(".dropdown-5").val()]}`.trim()], + ]; + + const filtersAreEmpty = appliedFilt.every((filter) => { + return filter[1] === "" || filter[1] === "undefined"; + }); + + // Remove the second row with the filters + if (lines.length > 2) { + lines.splice(1, 1); + } + let alteredLines = []; + lines.forEach((line) => { + let newLine = ""; + newLine = line.replace("open_in_new", ""); + alteredLines.push(newLine); + }); + + if (filtersAreEmpty) return alteredLines.join("\n"); + else { + // Add filter information to the first row + const secondRowFilters = [ + "Export of SDE Curated URLs", + `"(Applied Filters: ${appliedFilt + .reduce((acc, curr) => { + if ( + 
curr[1] !== " undefined" && + curr[1] !== " " && + curr[1] !== "" && + curr[1] !== "undefined" + ) { + acc = `${acc}, ${curr[0]} ${curr[1]}`; + } + return acc; + }, "") + .slice(2)})"`, + ]; + + var appliedFiltersInfo = secondRowFilters.join("\n"); + return appliedFiltersInfo + "\n" + alteredLines.join("\n"); + } + }, + }, + "spacer", + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#curated_urls_table"); + }, + }, + ], + }, + }, + serverSide: true, + orderCellsTop: true, + pagingType: "input", + rowId: "url", + stateLoadCallback: function (settings) { + var state = JSON.parse( + localStorage.getItem( + "DataTables_curated_urls_" + window.location.pathname + ) + ); + if (!state) { + settings.oInit.pageLength = 1; + } + return state; + }, + ajax: { + url: `/api/curated-urls/?format=datatables&collection_id=${collection_id}`, + data: function (d) { + d.is_excluded = $("#filter-checkbox").is(":checked") ? false : null; + }, + }, + initComplete: function (data) { + const addDropdownSelect = [1, 4, 5]; + const dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + this.api() + .columns() + .every(function (index) { + let column = this; + if (addDropdownSelect.includes(index)) { + $("thead tr td select.dropdown-" + index).on("change", function () { + var val = $.fn.dataTable.util.escapeRegex($(this).val()); + column.search(val ? "^" + val + "$" : "", true, false).draw(); + }); + } + }); + }, + + columns: [ + getCuratedURLColumn(), + getCuratedExcludedColumn(true_icon, false_icon), + getCuratedScrapedTitleColumn(), + getCuratedGeneratedTitleColumn(), + getCuratedDocumentTypeColumn(), + getCuratedDivisionColumn(), + { data: "id", visible: false, searchable: false }, + { data: "generated_title_id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: false }, + { data: "curated_urls_count", visible: false, searchable: false }, + { data: "excluded", visible: false, searchable: false }, + { + data: null, + render: function (data, type, row) { + if (!row.document_type) return "Select"; + return dict[row.document_type]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + const excludedDict = { + true: "Yes", + false: "No", + }; + return excludedDict[row.excluded]; + }, + visible: false, + }, + { + data: null, + render: function (data, type, row) { + return row.generated_title; + }, + visible: false, + }, + // ...(is_multi_division === 'true' ? 
[getDivisionColumn()] : []), + // getDivisionColumn(), + ], + createdRow: function (row, data, dataIndex) { + if (data["excluded"]) { + $(row).attr( + "style", + "background-color: rgba(255, 61, 87, 0.36) !important" + ); + } + }, + }); + + $("#curatedUrlFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(0).search(this.value).draw(); + }, 1000) + ); + + $("#curatedScrapedTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(2).search(this.value).draw(); + }, 1000) + ); + + $("#curatedNewTitleFilter").on( + "beforeinput", + DataTable.util.debounce(function (val) { + curated_urls_table.columns(3).search(this.value).draw(); }, 1000) ); var exclude_patterns_table = $("#exclude_patterns_table").DataTable({ - // scrollY: true, + serverSide: true, + paging: true, dom: "lBrtip", buttons: [ { @@ -346,7 +565,7 @@ function initializeDataTable() { ["Show 25", "Show 50", "Show 100", "Show 500"], ], orderCellsTop: true, - pageLength: 100, + pageLength: 50, ajax: `/api/exclude-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { var table = $("#exclude_patterns_table").DataTable(); @@ -359,12 +578,10 @@ function initializeDataTable() { $("#exclude-patterns-dropdown-1").prop("disabled", true); } else if (index === 1) { $("#exclude-patterns-dropdown-1").on("change", function () { - if ($(this).val() === "") table.columns(6).search("").draw(); + if ($(this).val() === "") table.columns(7).search("").draw(); else { - table - .column(6) - .search(matchPatternTypeMap[$(this).val()]) - .draw(); + const patternType = matchPatternTypeMap[$(this).val()]; + table.column(7).search(patternType).draw(); } }); } @@ -384,7 +601,12 @@ function initializeDataTable() { visible: false, }, { - data: "candidate_urls_count", + data: "delta_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: "curated_urls_count", class: "text-center whiteText", sortable: true, }, @@ -397,20 +619,21 @@ function initializeDataTable() { }, }, { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, + { data: "match_pattern_type", visible: false, searchable: true }, ], }); - $("#candidateMatchPatternFilter").on("beforeinput", function () { + $("#deltaMatchPatternFilter").on("beforeinput", function () { exclude_patterns_table.columns(0).search(this.value).draw(); }); - $("#candidateReasonFilter").on("beforeinput", function () { + $("#deltaReasonFilter").on("beforeinput", function () { exclude_patterns_table.columns(2).search(this.value).draw(); }); var include_patterns_table = $("#include_patterns_table").DataTable({ - // scrollY: true, + serverSide: true, + paging: true, lengthMenu: [ [25, 50, 100, 500], ["Show 25", "Show 50", "Show 100", "Show 500"], @@ -432,7 +655,7 @@ function initializeDataTable() { }, }, ], - pageLength: 100, + pageLength: 50, orderCellsTop: true, ajax: `/api/include-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { @@ -446,11 +669,11 @@ function initializeDataTable() { } else { if (index === 1) { $("#include-patterns-dropdown-1").on("change", function () { - if ($(this).val() === "") table.columns(5).search("").draw(); - table - .column(5) - .search(matchPatternTypeMap[$(this).val()]) - .draw(); + if ($(this).val() === "") table.columns(6).search("").draw(); + else { + const patternType = matchPatternTypeMap[$(this).val()]; + table.column(6).search(patternType).draw(); + 
} }); } } @@ -464,7 +687,12 @@ function initializeDataTable() { sortable: false, }, { - data: "candidate_urls_count", + data: "delta_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: "curated_urls_count", class: "text-center whiteText", sortable: true, }, @@ -477,11 +705,11 @@ function initializeDataTable() { }, }, { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, + { data: "match_pattern_type", visible: false, searchable: true }, ], }); - $("#candidateIncludeMatchPatternFilter").on("beforeinput", function () { + $("#deltaIncludeMatchPatternFilter").on("beforeinput", function () { include_patterns_table.columns(0).search(this.value).draw(); }); @@ -524,12 +752,10 @@ function initializeDataTable() { $("#title-patterns-dropdown-1").prop("disabled", true); } else if (index === 1) { $("#title-patterns-dropdown-1").on("change", function () { - if ($(this).val() === "") table.columns(6).search("").draw(); + if ($(this).val() === "") table.columns(7).search("").draw(); else { - table - .column(6) - .search(matchPatternTypeMap[$(this).val()]) - .draw(); + const patternType = matchPatternTypeMap[$(this).val()]; + table.column(7).search(patternType).draw(); } }); } @@ -544,7 +770,12 @@ function initializeDataTable() { }, { data: "title_pattern", class: "whiteText" }, { - data: "candidate_urls_count", + data: "delta_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: "curated_urls_count", class: "text-center whiteText", sortable: true, }, @@ -557,15 +788,15 @@ function initializeDataTable() { }, }, { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, + { data: "match_pattern_type", visible: false, searchable: true }, ], }); - $("#candidateTitleMatchPatternFilter").on("beforeinput", function (val) { + $("#deltaTitleMatchPatternFilter").on("beforeinput", function (val) { title_patterns_table.columns(0).search(this.value).draw(); }); - $("#candidateTitlePatternTypeFilter").on("beforeinput", function (val) { + $("#deltaTitlePatternTypeFilter").on("beforeinput", function (val) { title_patterns_table.columns(2).search(this.value).draw(); }); @@ -598,53 +829,19 @@ function initializeDataTable() { pageLength: 100, ajax: `/api/document-type-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { - this.api() - .columns() - .every(function (index) { - var table = $("#document_type_patterns_table").DataTable(); - - let addDropdownSelect = { - 1: { - columnToSearch: 6, - matchPattern: { - "Individual URL Pattern": 1, - "Multi-URL Pattern": 2, - }, - }, - 2: { - columnToSearch: 7, - matchPattern: { - Images: 1, - Data: 2, - Documentation: 3, - "Software and Tools": 4, - "Missions and Instruments": 5, - }, - }, - }; - - let column = this; - if (column.data().length === 0) { - $(`#document-type-patterns-dropdown-${index}`).prop( - "disabled", - true - ); - } else if (index in addDropdownSelect) { - $("#document-type-patterns-dropdown-" + index).on( - "change", - function () { - let col = addDropdownSelect[index].columnToSearch; - let searchInput = - addDropdownSelect[index].matchPattern[$(this).val()]; - if ($(this).val() === "" || $(this).val() === undefined) - table.columns(col).search("").draw(); - else { - table.columns(col).search(searchInput).draw(); - } - } - ); - } - }); + var table = $("#document_type_patterns_table").DataTable(); + this.api().columns().every(function (index) { + if (index === 1) { + 
$("#document-type-patterns-dropdown-1").on("change", function () { + if ($(this).val() === "") { + table.column(7).search("").draw(); + } else { + const patternType = matchPatternTypeMap[$(this).val()]; + table.column(7).search(patternType).draw(); + } + }); + } + }); }, columns: [ @@ -656,7 +853,12 @@ function initializeDataTable() { }, { data: "document_type_display", class: "whiteText" }, { - data: "candidate_urls_count", + data: "delta_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: "curated_urls_count", class: "text-center whiteText", sortable: true, }, @@ -669,12 +871,12 @@ function initializeDataTable() { }, }, { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, + { data: "match_pattern_type", visible: false, searchable: true }, { data: "document_type", visible: false }, ], }); - $("#candidateDocTypeMatchPatternFilter").on("beforeinput", function (val) { + $("#deltaDocTypeMatchPatternFilter").on("beforeinput", function (val) { document_type_patterns_table.columns(0).search(this.value).draw(); }); } @@ -682,100 +884,78 @@ function initializeDataTable() { var division_patterns_table = $("#division_patterns_table").DataTable({ dom: "lBrtip", buttons: [ - { - text: "Add Pattern", - className: "addPattern", - action: function () { - $modal = $("#divisionPatternModal").modal(); - }, + { + text: "Add Pattern", + className: "addPattern", + action: function () { + $modal = $("#divisionPatternModal").modal(); }, - { - text: "Customize Columns", - className: "customizeColumns", - action: function () { - modalContents("#division_patterns_table"); - }, + }, + { + text: "Customize Columns", + className: "customizeColumns", + action: function () { + modalContents("#division_patterns_table"); }, + }, ], lengthMenu: [ - [25, 50, 100, 500], - ["Show 25", "Show 50", "Show 100", "Show 500"], + [25, 50, 100, 500], + ["Show 25", "Show 50", "Show 100", "Show 500"], ], orderCellsTop: true, pageLength: 100, ajax: `/api/division-patterns/?format=datatables&collection_id=${collection_id}`, initComplete: function (data) { - this.api() - .columns() - .every(function (index) { - var table = $("#division_patterns_table").DataTable(); - - let addDropdownSelect = { - 1: { - columnToSearch: 6, - matchPattern: { - "Individual URL Pattern": 1, - "Multi-URL Pattern": 2, - }, - }, - 2: { - columnToSearch: 7, - matchPattern: { - "Astrophysics": 1, - "Biological and Physical Sciences": 2, - "Earth Science": 3, - "Heliophysics": 4, - "Planetary Science": 5, - }, - }, - }; - - let column = this; - if (column.data().length === 0) { - $(`#division-patterns-dropdown-${index}`).prop("disabled", true); - } else if (index in addDropdownSelect) { - $("#division-patterns-dropdown-" + index).on("change", function () { - let col = addDropdownSelect[index].columnToSearch; - let searchInput = - addDropdownSelect[index].matchPattern[$(this).val()]; - if ($(this).val() === "" || $(this).val() === undefined) - table.columns(col).search("").draw(); - else { - table.columns(col).search(searchInput).draw(); - } - }); - } - }); - }, - - columns: [ - { data: "match_pattern", class: "whiteText" }, - { - data: "match_pattern_type_display", - class: "text-center whiteText", - sortable: false, - }, - { data: "division_display", class: "whiteText" }, - { - data: "candidate_urls_count", - class: "text-center whiteText", - sortable: true, - }, - { - data: null, - sortable: false, - class: "text-center", - render: function (data, type, row) { - return ``; - }, + var 
table = $("#division_patterns_table").DataTable(); + this.api().columns().every(function (index) { + if (index === 1) { + // Updated pattern type dropdown handler + $("#division-patterns-dropdown-1").on("change", function () { + if ($(this).val() === "") { + table.column(7).search("").draw(); + } else { + const patternType = matchPatternTypeMap[$(this).val()]; + table.column(7).search(patternType).draw(); + } + }); + } + }); + }, + + columns: [ + { data: "match_pattern", class: "whiteText" }, + { + data: "match_pattern_type_display", + class: "text-center whiteText", + sortable: false, + }, + { data: "division_display", class: "whiteText" }, + { + data: "delta_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: "curated_urls_count", + class: "text-center whiteText", + sortable: true, + }, + { + data: null, + sortable: false, + class: "text-center", + render: function (data, type, row) { + return ``; }, - { data: "id", visible: false, searchable: false }, - { data: "match_pattern_type", visible: false }, - { data: "division", visible: false }, + }, + { data: "id", visible: false, searchable: false }, + { data: "match_pattern_type", visible: false, searchable: true }, + { data: "division", visible: false }, ], }); -$("#candidateDivisionMatchPatternFilter").on("beforeinput", function (val) { +$("#deltaDivisionMatchPatternFilter").on("beforeinput", function (val) { division_patterns_table.columns(0).search(this.value).draw(); }); @@ -823,6 +1003,7 @@ function setupClickHandlers() { handleTabsClick(); handleWorkflowStatusSelect(); + handleReindexingStatusSelect(); } function getDivisionColumn() { @@ -841,8 +1022,32 @@ function getDivisionColumn() {

+
`; + }, + }; +} + +function getCuratedDivisionColumn() { + return { + data: "division", + width: "10%", + visible: (is_multi_division === "true"), searchable: is_multi_division, + render: function (data, type, row) { + let button_text = data ? divisionDict[data] : "Select"; + let button_color = data ? "btn-success" : "btn-secondary"; + return ` + `; }, @@ -854,6 +1059,8 @@ function handleDivisionSelect() { $("body").on("click", ".division_select", function () { var match_pattern = $(this).closest(".document_type_dropdown").data("match-pattern"); var division = $(this).attr("value"); + // var match_pattern_type = $(this).attr("match-pattern-type"); + // postDivisionPatterns(match_pattern, match_pattern_type, division); postDivisionPatterns(match_pattern, 1, division); }); } @@ -867,7 +1074,7 @@ function postDivision(urlId, division) { csrfmiddlewaretoken: csrftoken, }, success: function (data) { - $('#candidate_urls_table').DataTable().ajax.reload(null, false); + $('#delta_urls_table').DataTable().ajax.reload(null, false); toastr.success("Division assigned successfully!"); }, error: function (xhr, status, error) { @@ -882,12 +1089,12 @@ $("#division_pattern_form").on("submit", function (e) { inputs = {}; input_serialized = $(this).serializeArray(); input_serialized.forEach((field) => { - inputs[field.name] = field.value; + inputs[field.name] = field.value; }); console.log("Form Inputs:", inputs); // Debugging line to check inputs - postDivisionPatterns(inputs.match_pattern, 2, inputs.division_pattern); + postDivisionPatterns(inputs.match_pattern, inputs.match_pattern_type, inputs.division_pattern); // Close the modal if it is open $("#divisionPatternModal").modal("hide"); @@ -902,43 +1109,43 @@ $(".division_form_select").on("click", function (e) { function postDivisionPatterns(match_pattern, match_pattern_type, division) { if (!match_pattern) { - toastr.error("Please highlight a pattern to add division."); - return; + toastr.error("Please highlight a pattern to add division."); + return; } $.ajax({ - url: "/api/division-patterns/", - type: "POST", - data: { - collection: collection_id, - match_pattern: match_pattern, - match_pattern_type: match_pattern_type, - division: division, - csrfmiddlewaretoken: csrftoken, - }, - success: function (data) { - $("#candidate_urls_table").DataTable().ajax.reload(null, false); - $("#division_patterns_table").DataTable().ajax.reload(null, false); - if (currentTab === "") { // Only add a notification if we are on the first tab - newDivisionPatternsCount = newDivisionPatternsCount + 1; - $("#divisionPatternsTab").html( - `Division Patterns ` + - newDivisionPatternsCount + " new" + - `` - ); - } - }, - error: function (xhr, status, error) { - var errorMessage = xhr.responseText; - if ( - errorMessage == - '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}' - ) { - toastr.success("Pattern already exists"); - return; - } - toastr.error(errorMessage); - }, + url: "/api/division-patterns/", + type: "POST", + data: { + collection: collection_id, + match_pattern: match_pattern, + match_pattern_type: match_pattern_type, + division: division, + csrfmiddlewaretoken: csrftoken, + }, + success: function (data) { + $("#delta_urls_table").DataTable().ajax.reload(null, false); + $("#division_patterns_table").DataTable().ajax.reload(null, false); + if (currentTab === "") { // Only add a notification if we are on the first tab + newDivisionPatternsCount = newDivisionPatternsCount + 1; + 
$("#divisionPatternsTab").html( + `Division Patterns ` + + newDivisionPatternsCount + " new" + + `` + ); + } + }, + error: function (xhr, status, error) { + var errorMessage = xhr.responseText; + if ( + errorMessage == + '{"error":{"non_field_errors":["The fields collection, match_pattern must make a unique set."]},"status_code":400}' + ) { + toastr.success("Pattern already exists"); + return; + } + toastr.error(errorMessage); + }, }); } @@ -947,12 +1154,36 @@ function getURLColumn() { data: "url", width: "30%", render: function (data, type, row) { - return `
${remove_protocol( + return `
${remove_protocol( + data + )} + open_in_new
`; + }, + }; +} + +function getCuratedURLColumn() { + return { + data: "url", + width: "30%", + render: function (data, type, row) { + return `
${remove_protocol( data )} - open_in_new
`; + open_in_new
`; + }, + }; +} + +function getDeletedColumn(true_icon, false_icon) { + return { + data: "to_delete", + width: "10%", + class: "col-1 text-center", + render: function (data, type, row) { + return data === true ? true_icon : false_icon; }, }; } @@ -967,18 +1198,38 @@ function getScrapedTitleColumn() { }; } +function getCuratedScrapedTitleColumn() { + return { + data: "scraped_title", + width: "30%", + render: function (data, type, row) { + return `${data}`; + }, + }; +} + function getGeneratedTitleColumn() { return { data: "generated_title", width: "20%", render: function (data, type, row) { - return ``; + return ``; + }, + }; +} + +function getCuratedGeneratedTitleColumn() { + return { + data: "generated_title", + width: "20%", + render: function (data, type, row) { + return ``; }, }; } @@ -991,11 +1242,28 @@ function getExcludedColumn(true_icon, false_icon) { render: function (data, type, row) { return data === true ? `${true_icon}` + row["url"] + )}>${true_icon}` + : `${false_icon}`; + }, + }; +} + +function getCuratedExcludedColumn(true_icon, false_icon) { + return { + data: "excluded", + width: "10%", + class: "col-1 text-center", + render: function (data, type, row) { + return data === true + ? `${true_icon}` : `${false_icon}`; + row["url"] + )}>${false_icon}`; }, }; } @@ -1016,8 +1284,42 @@ function getDocumentTypeColumn() { button_color = data ? "btn-success" : "btn-secondary"; return ` `; + }, + }; +} + +function getCuratedDocumentTypeColumn() { + return { + data: "document_type", + width: "10%", + render: function (data, type, row) { + var dict = { + 1: "Images", + 2: "Data", + 3: "Documentation", + 4: "Software and Tools", + 5: "Missions and Instruments", + }; + button_text = data ? dict[data] : "Select"; + button_color = data ? "btn-success" : "btn-secondary"; + return ` +
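/*
 * A minimal sketch, not part of the diff above: the new dropdown handlers call
 * `matchPatternTypeMap`, which is referenced but not defined in these hunks.
 * Assuming it mirrors the label-to-integer mapping of the removed
 * `addDropdownSelect` config, it might look like this.
 */
const matchPatternTypeMap = {
  "Individual URL Pattern": 1,
  "Multi-URL Pattern": 2,
};

// Example wiring: translate the selected label into the integer stored in the
// hidden, now-searchable `match_pattern_type` column (index 7 in these tables).
$("#division-patterns-dropdown-1").on("change", function () {
  const table = $("#division_patterns_table").DataTable();
  const label = $(this).val();
  if (!label) {
    table.column(7).search("").draw();
  } else {
    table.column(7).search(matchPatternTypeMap[label]).draw();
  }
});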
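/*
 * A sketch of how the new getCurated*Column() factories might be composed into a
 * curated-URLs DataTable. The table id, endpoint, and icon markup are assumptions
 * for illustration only; the factory names and `collection_id` come from this file.
 */
const trueIcon = '<i class="material-icons tiny">check</i>';  // assumed markup
const falseIcon = '<i class="material-icons tiny">close</i>'; // assumed markup

$("#curated_urls_table").DataTable({ // assumed table id
  ajax: `/api/curated-urls/?format=datatables&collection_id=${collection_id}`, // assumed endpoint
  columns: [
    getCuratedURLColumn(),
    getCuratedExcludedColumn(trueIcon, falseIcon),
    getCuratedScrapedTitleColumn(),
    getCuratedGeneratedTitleColumn(),
    getCuratedDocumentTypeColumn(),
    getCuratedDivisionColumn(),
  ],
});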
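/*
 * A hedged alternative, not part of the diff above: postDivisionPatterns() detects
 * the "already exists" case by comparing xhr.responseText against a hard-coded JSON
 * string, which breaks if the serializer message or key order changes. A check based
 * on the parsed payload keeps the same behaviour while being less brittle.
 */
function isDuplicatePatternError(xhr) {
  const err = xhr.responseJSON && xhr.responseJSON.error;
  const nonFieldErrors = (err && err.non_field_errors) || [];
  return (
    xhr.status === 400 &&
    nonFieldErrors.some((msg) => msg.includes("must make a unique set"))
  );
}

// Usage inside the $.ajax error callback:
//   if (isDuplicatePatternError(xhr)) {
//     toastr.success("Pattern already exists");
//     return;
//   }
//   toastr.error(xhr.responseText);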