Skip to content

Commit 2a58765

Browse files
committed
feat(app): remove huggingface logic
1 parent 82ce183 commit 2a58765

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+1
-15837
lines changed

Dockerfile

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,6 @@ COPY collector_db ./collector_db
2222
COPY collector_manager ./collector_manager
2323
COPY core ./core
2424
COPY html_tag_collector ./html_tag_collector
25-
COPY hugging_face/url_relevance ./hugging_face/url_relevance
26-
COPY hugging_face/url_record_type_labeling ./hugging_face/url_record_type_labeling
27-
COPY hugging_face/HuggingFaceInterface.py ./hugging_face/HuggingFaceInterface.py
2825
COPY source_collectors ./source_collectors
2926
COPY util ./util
3027
COPY alembic.ini ./alembic.ini

README.md

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@ name | description of purpose
88
agency_identifier | Matches URLs with an agency from the PDAP database
99
annotation_pipeline | Automated pipeline for generating training data in our ML data source identification models. Manages common crawl, HTML tag collection, and Label Studio import/export
1010
html_tag_collector | Collects HTML header, meta, and title tags and appends them to a JSON file. The idea is to make a richer dataset for algorithm training and data labeling.
11-
hugging_face | Utilities for interacting with our machine learning space at [Hugging Face](https://huggingface.co/PDAP)
1211
identification_pipeline.py | The core python script uniting this modular pipeline. More details below.
1312
openai-playground | Scripts for accessing the openai API on PDAP's shared account
1413
source_collectors| Tools for extracting metadata from different sources, including CKAN data portals and Common Crawler

api/main.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
from html_tag_collector.ResponseParser import HTMLResponseParser
2727
from html_tag_collector.RootURLCache import RootURLCache
2828
from html_tag_collector.URLRequestInterface import URLRequestInterface
29-
from hugging_face.HuggingFaceInterface import HuggingFaceInterface
3029
from pdap_access_manager import AccessManager
3130
from pdap_api_client.PDAPClient import PDAPClient
3231
from util.DiscordNotifier import DiscordPoster
@@ -54,7 +53,6 @@ async def lifespan(app: FastAPI):
5453
)
5554
task_manager = TaskManager(
5655
adb_client=adb_client,
57-
huggingface_interface=HuggingFaceInterface(),
5856
url_request_interface=URLRequestInterface(),
5957
html_parser=HTMLResponseParser(
6058
root_url_cache=RootURLCache()

core/DTOs/task_data_objects/URLRelevanceHuggingfaceTDO.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

core/TaskManager.py

Lines changed: 1 addition & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,9 @@
1515
from core.classes.task_operators.URLHTMLTaskOperator import URLHTMLTaskOperator
1616
from core.classes.task_operators.URLMiscellaneousMetadataTaskOperator import URLMiscellaneousMetadataTaskOperator
1717
from core.classes.task_operators.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator
18-
from core.classes.task_operators.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator
1918
from core.enums import BatchStatus
2019
from html_tag_collector.ResponseParser import HTMLResponseParser
2120
from html_tag_collector.URLRequestInterface import URLRequestInterface
22-
from hugging_face.HuggingFaceInterface import HuggingFaceInterface
2321
from llm_api_logic.OpenAIRecordClassifier import OpenAIRecordClassifier
2422
from pdap_api_client.PDAPClient import PDAPClient
2523
from util.DiscordNotifier import DiscordPoster
@@ -31,16 +29,14 @@ class TaskManager:
3129
def __init__(
3230
self,
3331
adb_client: AsyncDatabaseClient,
34-
huggingface_interface: HuggingFaceInterface,
3532
url_request_interface: URLRequestInterface,
3633
html_parser: HTMLResponseParser,
3734
discord_poster: DiscordPoster,
38-
pdap_client: PDAPClient
35+
pdap_client: PDAPClient,
3936
):
4037
# Dependencies
4138
self.adb_client = adb_client
4239
self.pdap_client = pdap_client
43-
self.huggingface_interface = huggingface_interface
4440
self.url_request_interface = url_request_interface
4541
self.html_parser = html_parser
4642
self.discord_poster = discord_poster
@@ -62,13 +58,6 @@ async def get_url_html_task_operator(self):
6258
)
6359
return operator
6460

65-
async def get_url_relevance_huggingface_task_operator(self):
66-
operator = URLRelevanceHuggingfaceTaskOperator(
67-
adb_client=self.adb_client,
68-
huggingface_interface=self.huggingface_interface
69-
)
70-
return operator
71-
7261
async def get_url_record_type_task_operator(self):
7362
operator = URLRecordTypeTaskOperator(
7463
adb_client=self.adb_client,
@@ -117,7 +106,6 @@ async def get_task_operators(self) -> list[TaskOperatorBase]:
117106
await self.get_url_html_task_operator(),
118107
await self.get_url_duplicate_task_operator(),
119108
await self.get_url_404_probe_task_operator(),
120-
# await self.get_url_relevance_huggingface_task_operator(),
121109
await self.get_url_record_type_task_operator(),
122110
await self.get_agency_identification_task_operator(),
123111
await self.get_url_miscellaneous_metadata_task_operator(),

core/classes/task_operators/URLRelevanceHuggingfaceTaskOperator.py

Lines changed: 0 additions & 63 deletions
This file was deleted.

hugging_face/HuggingFaceInterface.py

Lines changed: 0 additions & 40 deletions
This file was deleted.

hugging_face/README.md

Lines changed: 0 additions & 58 deletions
This file was deleted.

hugging_face/__init__.py

Whitespace-only changes.

hugging_face/example/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)