diff --git a/README.md b/README.md index 33440d64..0b9d167c 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,11 @@ name | description of purpose .github/workflows | Scheduling and automation agency_identifier | Matches URLs with an agency from the PDAP database annotation_pipeline | Automated pipeline for generating training data in our ML data source identification models. Manages common crawl, HTML tag collection, and Label Studio import/export -common_crawler | Interfaces with the Common Crawl dataset to extract urls, creating batches to identify or annotate html_tag_collector | Collects HTML header, meta, and title tags and appends them to a JSON file. The idea is to make a richer dataset for algorithm training and data labeling. hugging_face | Utilities for interacting with our machine learning space at [Hugging Face](https://huggingface.co/PDAP) identification_pipeline.py | The core python script uniting this modular pipeline. More details below. openai-playground | Scripts for accessing the openai API on PDAP's shared account +source_collectors | Tools for extracting metadata from different sources, including CKAN data portals and Common Crawler ## How to use diff --git a/source_collectors/README.md b/source_collectors/README.md new file mode 100644 index 00000000..1b02215c --- /dev/null +++ b/source_collectors/README.md @@ -0,0 +1,11 @@ +# Source Collectors + +The Source Collectors are tools that collect URLs from various sources so that they can be stored as batches. + +Each tool is intended to be used in conjunction with the other tools in this repo. + +Currently, the following tools are available: +- Common Crawler: Interfaces with the Common Crawl dataset to extract URLs +- Auto-Googler: A tool for automating the process of fetching URLs from Google Search and processing them for source collection. 
+- CKAN Scraper: A tool for retrieving packages from CKAN data portals and parsing relevant information +- MuckRock Scraper: A tool for retrieving FOIA requests from MuckRock and parsing relevant information \ No newline at end of file diff --git a/source_collectors/auto_googler/AutoGoogler.py b/source_collectors/auto_googler/AutoGoogler.py new file mode 100644 index 00000000..0141ce23 --- /dev/null +++ b/source_collectors/auto_googler/AutoGoogler.py @@ -0,0 +1,28 @@ +from source_collectors.auto_googler.GoogleSearcher import GoogleSearcher +from source_collectors.auto_googler.SearchConfig import SearchConfig + + +class AutoGoogler: + """ + The AutoGoogler orchestrates the process of fetching urls from Google Search + and processing them for source collection + + """ + def __init__(self, search_config: SearchConfig, google_searcher: GoogleSearcher): + self.search_config = search_config + self.google_searcher = google_searcher + self.data = {query : [] for query in search_config.queries} + + def run(self) -> str: + """ + Runs the AutoGoogler + Yields status messages + """ + for query in self.search_config.queries: + yield f"Searching for '{query}' ..." + results = self.google_searcher.search(query) + yield f"Found {len(results)} results for '{query}'." + if results is not None: + self.data[query] = results + yield "Done." + diff --git a/source_collectors/auto_googler/GoogleSearcher.py b/source_collectors/auto_googler/GoogleSearcher.py new file mode 100644 index 00000000..6638770a --- /dev/null +++ b/source_collectors/auto_googler/GoogleSearcher.py @@ -0,0 +1,65 @@ +from typing import Union + +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +class QuotaExceededError(Exception): + pass + +class GoogleSearcher: + """ + A class that provides a GoogleSearcher object for performing searches using the Google Custom Search API. + + Attributes: + api_key (str): The API key required for accessing the Google Custom Search API. 
+ cse_id (str): The CSE (Custom Search Engine) ID required for identifying the specific search engine to use. + service (Google API service): The Google API service object for performing the search. + + Methods: + __init__(api_key: str, cse_id: str) + Initializes a GoogleSearcher object with the provided API key and CSE ID. Raises a RuntimeError if either + the API key or CSE ID is None. + + search(query: str) -> Union[list[dict], None] + Performs a search using the Google Custom Search API with the provided query string. Returns a list of + search results as dictionaries, or None if the response contains no items. Raises a QuotaExceededError if the + daily quota for the API has been exceeded, or a RuntimeError if any other HTTP error occurs during the search. + """ + GOOGLE_SERVICE_NAME = "customsearch" + GOOGLE_SERVICE_VERSION = "v1" + + def __init__( + self, + api_key: str, + cse_id: str + ): + if api_key is None or cse_id is None: + raise RuntimeError("Custom search API key and CSE ID cannot be None.") + self.api_key = api_key + self.cse_id = cse_id + + self.service = build(self.GOOGLE_SERVICE_NAME, + self.GOOGLE_SERVICE_VERSION, + developerKey=self.api_key) + + def search(self, query: str) -> Union[list[dict], None]: + """ + Searches for results using the specified query. + + Args: + query (str): The query to search for. + + Returns: Union[list[dict], None]: A list of dictionaries representing the search results. + None is returned if the response contains no items; QuotaExceededError is raised if the daily quota is exceeded. 
+ """ + try: + res = self.service.cse().list(q=query, cx=self.cse_id).execute() + if "items" not in res: + return None + return res['items'] + # Process your results + except HttpError as e: + if "Quota exceeded" in str(e): + raise QuotaExceededError("Quota exceeded for the day") + else: + raise RuntimeError(f"An error occurred: {str(e)}") \ No newline at end of file diff --git a/source_collectors/auto_googler/README.md b/source_collectors/auto_googler/README.md new file mode 100644 index 00000000..99cc96dc --- /dev/null +++ b/source_collectors/auto_googler/README.md @@ -0,0 +1,13 @@ +The Auto-Googler is a tool that automates the process of fetching URLs from Google Search and processing them for source collection. + +Auto-Googler logic consists of: + +1. `GoogleSearcher`, the class that interfaces with the Google Custom Search API; it requires both an API key and a Custom Search Engine (CSE) ID. +2. `AutoGoogler`, the class that orchestrates the search process. +3. `SearchConfig`, the class that holds the configuration for the search queries. + +The following environment variables must be set in a `.env` file in the root directory: + +- GOOGLE_API_KEY: The API key required for accessing the Google Custom Search API. + +The Auto-Googler is intended to be used in conjunction with the other tools in this repo. diff --git a/source_collectors/auto_googler/SearchConfig.py b/source_collectors/auto_googler/SearchConfig.py new file mode 100644 index 00000000..b2daf8fd --- /dev/null +++ b/source_collectors/auto_googler/SearchConfig.py @@ -0,0 +1,24 @@ +from typing import Annotated + +from pydantic import BaseModel, Field + + +class SearchConfig(BaseModel): + """ + A class that holds the configuration for the AutoGoogler + Simple now, but might be extended in the future + """ + urls_per_result: Annotated[ + int, + "Maximum number of URLs returned per result. Minimum is 1. Default is 10" + ] = Field( + default=10, + ge=1 + ) + queries: Annotated[ + list[str], + "List of queries to search for." 
+ ] = Field( + min_length=1 + ) + diff --git a/source_collectors/auto_googler/__init__.py b/source_collectors/auto_googler/__init__.py new file mode 100644 index 00000000..e69de29b