Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test_app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ jobs:

- name: Run tests
run: |
uv run pytest tests/test_automated
uv run pytest tests/test_alembic
uv run pytest tests/automated
uv run pytest tests/alembic
8 changes: 1 addition & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ RUN playwright install chromium

# Copy project files
COPY api ./api
COPY collector_db ./collector_db
COPY db ./collector_db
COPY collector_manager ./collector_manager
COPY core ./core
COPY html_tag_collector ./html_tag_collector
Expand All @@ -32,12 +32,6 @@ COPY pdap_api_client ./pdap_api_client
COPY execute.sh ./execute.sh
COPY .project-root ./.project-root

COPY tests/conftest.py ./tests/conftest.py
COPY tests/__init__.py ./tests/__init__.py
COPY tests/test_automated ./tests/test_automated
COPY tests/test_alembic ./tests/test_alembic
COPY tests/helpers ./tests/helpers

COPY llm_api_logic ./llm_api_logic

# Expose the application port
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ To access the API documentation, visit `http://{host}:8000/docs`.
To run tests on the container, run:

```bash
docker exec data-source-identification-app-1 pytest /app/tests/test_automated
docker exec data-source-identification-app-1 pytest /app/tests/automated
```

Be sure to inspect the `docker-compose.yml` file in the root directory -- some environment variables are dependent upon the Operating System you are using.
Expand Down
4 changes: 2 additions & 2 deletions alembic/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from sqlalchemy import engine_from_config
from sqlalchemy import pool

from collector_db.helper_functions import get_postgres_connection_string
from collector_db.models import Base
from db.helper_functions import get_postgres_connection_string
from db.models import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from alembic import op
import sqlalchemy as sa

from collector_db.enums import PGEnum
from db.enums import PGEnum

# revision identifiers, used by Alembic.
revision: str = '072b32a45b1c'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from alembic import op
import sqlalchemy as sa
from collector_db.enums import PGEnum
from db.enums import PGEnum
# revision identifiers, used by Alembic.
revision: str = '19bf57df581a'
down_revision: Union[str, None] = '072b32a45b1c'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sqlalchemy as sa
from alembic import op

from collector_db.enums import PGEnum
from db.enums import PGEnum

# revision identifiers, used by Alembic.
revision: str = '9afd8a5633c9'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from alembic import op
import sqlalchemy as sa

from collector_db.enums import PGEnum
from db.enums import PGEnum

# revision identifiers, used by Alembic.
revision: str = 'd7eb670edaf0'
Expand Down
4 changes: 2 additions & 2 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
from api.routes.search import search_router
from api.routes.task import task_router
from api.routes.url import url_router
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DatabaseClient import DatabaseClient
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DatabaseClient import DatabaseClient
from collector_manager.AsyncCollectorManager import AsyncCollectorManager
from core.AsyncCore import AsyncCore
from core.AsyncCoreLogger import AsyncCoreLogger
Expand Down
2 changes: 1 addition & 1 deletion api/routes/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from fastapi.params import Query, Depends

from api.dependencies import get_core, get_async_core
from collector_db.DTOs.BatchInfo import BatchInfo
from db.DTOs.BatchInfo import BatchInfo
from collector_manager.enums import CollectorType
from core.AsyncCore import AsyncCore
from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse
Expand Down
6 changes: 3 additions & 3 deletions api/routes/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from fastapi import APIRouter, Depends, Query, Path

from api.dependencies import get_async_core
from collector_db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo
from collector_db.DTOs.TaskInfo import TaskInfo
from collector_db.enums import TaskType
from db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo
from db.DTOs.TaskInfo import TaskInfo
from db.enums import TaskType
from core.AsyncCore import AsyncCore
from core.enums import BatchStatus
from security_manager.SecurityManager import AccessInfo, get_access_info
Expand Down
2 changes: 1 addition & 1 deletion apply_migrations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from alembic import command
from alembic.config import Config

from collector_db.helper_functions import get_postgres_connection_string
from db.helper_functions import get_postgres_connection_string

def apply_migrations():
print("Applying migrations...")
Expand Down
6 changes: 3 additions & 3 deletions collector_manager/AsyncCollectorBase.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

from pydantic import BaseModel

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
from collector_db.DTOs.LogInfo import LogInfo
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.InsertURLsInfo import InsertURLsInfo
from db.DTOs.LogInfo import LogInfo
from collector_manager.enums import CollectorType
from core.AsyncCoreLogger import AsyncCoreLogger
from core.FunctionTrigger import FunctionTrigger
Expand Down
2 changes: 1 addition & 1 deletion collector_manager/AsyncCollectorManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from fastapi import HTTPException
from pydantic import BaseModel

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_manager.AsyncCollectorBase import AsyncCollectorBase
from collector_manager.CollectorManager import InvalidCollectorError
from collector_manager.collector_mapping import COLLECTOR_MAPPING
Expand Down
8 changes: 4 additions & 4 deletions core/AsyncCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from pydantic import BaseModel
from sqlalchemy.exc import IntegrityError

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.BatchInfo import BatchInfo
from collector_db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.BatchInfo import BatchInfo
from db.DTOs.GetTaskStatusResponseInfo import GetTaskStatusResponseInfo
from db.enums import TaskType
from collector_manager.AsyncCollectorManager import AsyncCollectorManager
from collector_manager.enums import CollectorType
from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
Expand Down
4 changes: 2 additions & 2 deletions core/AsyncCoreLogger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.LogInfo import LogInfo
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.LogInfo import LogInfo


class AsyncCoreLogger:
Expand Down
4 changes: 2 additions & 2 deletions core/DTOs/CollectionLifecycleInfo.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pydantic import BaseModel

from collector_db.DTOs.DuplicateInfo import DuplicateInfo
from collector_db.DTOs.URLMapping import URLMapping
from db.DTOs.DuplicateInfo import DuplicateInfo
from db.DTOs.URLMapping import URLMapping


class CollectionLifecycleInfo(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetBatchLogsResponse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pydantic import BaseModel

from collector_db.DTOs.LogInfo import LogOutputInfo
from db.DTOs.LogInfo import LogOutputInfo


class GetBatchLogsResponse(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetBatchStatusResponse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pydantic import BaseModel

from collector_db.DTOs.BatchInfo import BatchInfo
from db.DTOs.BatchInfo import BatchInfo


class GetBatchStatusResponse(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetDuplicatesByBatchResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel

from collector_db.DTOs.DuplicateInfo import DuplicateInfo
from db.DTOs.DuplicateInfo import DuplicateInfo


class GetDuplicatesByBatchResponse(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetNextRecordTypeAnnotationResponseInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import Field, BaseModel

from collector_db.DTOs.URLMapping import URLMapping
from db.DTOs.URLMapping import URLMapping
from core.enums import RecordType
from html_tag_collector.DataClassTags import ResponseHTMLInfo

Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetNextRelevanceAnnotationResponseInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel, Field

from collector_db.DTOs.URLMapping import URLMapping
from db.DTOs.URLMapping import URLMapping
from core.DTOs.ResponseURLInfo import ResponseURLInfo
from html_tag_collector.DataClassTags import ResponseHTMLInfo

Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetTasksResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel

from collector_db.enums import TaskType
from db.enums import TaskType
from core.enums import BatchStatus


Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetURLsByBatchResponse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pydantic import BaseModel

from collector_db.DTOs.URLInfo import URLInfo
from db.DTOs.URLInfo import URLInfo


class GetURLsByBatchResponse(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/GetURLsResponseInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pydantic import BaseModel

from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource
from db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource
from collector_manager.enums import URLStatus

class GetURLsResponseErrorInfo(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/task_data_objects/URLRecordTypeTDO.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel

from collector_db.DTOs.URLWithHTML import URLWithHTML
from db.DTOs.URLWithHTML import URLWithHTML
from core.enums import RecordType


Expand Down
2 changes: 1 addition & 1 deletion core/DTOs/task_data_objects/UrlHtmlTDO.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel

from collector_db.DTOs.URLInfo import URLInfo
from db.DTOs.URLInfo import URLInfo
from html_tag_collector.DataClassTags import ResponseHTMLInfo
from html_tag_collector.URLRequestInterface import URLResponseInfo

Expand Down
2 changes: 1 addition & 1 deletion core/SourceCollectorCore.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Optional, Any


from collector_db.DatabaseClient import DatabaseClient
from db.DatabaseClient import DatabaseClient
from core.enums import BatchStatus


Expand Down
6 changes: 3 additions & 3 deletions core/TaskManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from core.classes.task_operators.URL404ProbeTaskOperator import URL404ProbeTaskOperator
from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator
from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.TaskInfo import TaskInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.TaskInfo import TaskInfo
from db.enums import TaskType
from core.DTOs.GetTasksResponse import GetTasksResponse
from core.DTOs.TaskOperatorRunInfo import TaskOperatorRunInfo, TaskOperatorOutcome
from core.FunctionTrigger import FunctionTrigger
Expand Down
2 changes: 1 addition & 1 deletion core/classes/HTMLContentInfoGetter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from collector_db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType
from db.DTOs.URLHTMLContentInfo import URLHTMLContentInfo, HTMLContentType

Check warning on line 1 in core/classes/HTMLContentInfoGetter.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/HTMLContentInfoGetter.py#L1 <100>

Missing docstring in public module
Raw output
./core/classes/HTMLContentInfoGetter.py:1:1: D100 Missing docstring in public module
from html_tag_collector.DataClassTags import ResponseHTMLInfo


Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from aiohttp import ClientSession

from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from db.enums import TaskType
from collector_manager.enums import CollectorType
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient

Check warning on line 1 in core/classes/task_operators/SubmitApprovedURLTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/SubmitApprovedURLTaskOperator.py#L1 <100>

Missing docstring in public module
Raw output
./core/classes/task_operators/SubmitApprovedURLTaskOperator.py:1:1: D100 Missing docstring in public module
from db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from db.enums import TaskType
from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
from pdap_api_client.PDAPClient import PDAPClient
Expand Down
4 changes: 2 additions & 2 deletions core/classes/task_operators/TaskOperatorBase.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import traceback
from abc import ABC, abstractmethod
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.enums import TaskType
from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome, TaskOperatorRunInfo
from core.enums import BatchStatus

Expand Down
4 changes: 2 additions & 2 deletions core/classes/task_operators/URL404ProbeTaskOperator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from pydantic import BaseModel

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.enums import TaskType
from core.DTOs.task_data_objects.URL404ProbeTDO import URL404ProbeTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
from html_tag_collector.URLRequestInterface import URLRequestInterface
Expand Down
4 changes: 2 additions & 2 deletions core/classes/task_operators/URLDuplicateTaskOperator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from aiohttp import ClientResponseError

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.enums import TaskType
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
from pdap_api_client.PDAPClient import PDAPClient
Expand Down
8 changes: 4 additions & 4 deletions core/classes/task_operators/URLHTMLTaskOperator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from http import HTTPStatus

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from collector_db.DTOs.URLInfo import URLInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from db.DTOs.URLInfo import URLInfo
from db.enums import TaskType
from core.DTOs.task_data_objects.UrlHtmlTDO import UrlHtmlTDO
from core.classes.HTMLContentInfoGetter import HTMLContentInfoGetter
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Optional

from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient
from db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from db.enums import TaskType
from collector_manager.enums import CollectorType
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
Expand Down
6 changes: 3 additions & 3 deletions core/classes/task_operators/URLRecordTypeTaskOperator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from collector_db.enums import TaskType
from db.AsyncDatabaseClient import AsyncDatabaseClient

Check warning on line 1 in core/classes/task_operators/URLRecordTypeTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLRecordTypeTaskOperator.py#L1 <100>

Missing docstring in public module
Raw output
./core/classes/task_operators/URLRecordTypeTaskOperator.py:1:1: D100 Missing docstring in public module
from db.DTOs.URLErrorInfos import URLErrorPydanticInfo
from db.enums import TaskType
from core.DTOs.task_data_objects.URLRecordTypeTDO import URLRecordTypeTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
from core.enums import RecordType
Expand Down
2 changes: 1 addition & 1 deletion core/preprocessors/AutoGooglerPreprocessor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List

from collector_db.DTOs.URLInfo import URLInfo
from db.DTOs.URLInfo import URLInfo
from core.preprocessors.PreprocessorBase import PreprocessorBase


Expand Down
Loading