From 71c1b02b77f354280db43855bb5c1104bfa5039f Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Sun, 22 Dec 2024 09:57:01 -0500
Subject: [PATCH 1/3] Create AutoGoogler.py

---
 source_collectors/auto_googler/AutoGoogler.py | 28 ++++++++
 .../auto_googler/GoogleSearcher.py            | 65 +++++++++++++++++++
 source_collectors/auto_googler/README.md      |  6 ++
 .../auto_googler/SearchConfig.py              | 24 +++++++
 source_collectors/auto_googler/__init__.py    |  0
 5 files changed, 123 insertions(+)
 create mode 100644 source_collectors/auto_googler/AutoGoogler.py
 create mode 100644 source_collectors/auto_googler/GoogleSearcher.py
 create mode 100644 source_collectors/auto_googler/README.md
 create mode 100644 source_collectors/auto_googler/SearchConfig.py
 create mode 100644 source_collectors/auto_googler/__init__.py

diff --git a/source_collectors/auto_googler/AutoGoogler.py b/source_collectors/auto_googler/AutoGoogler.py
new file mode 100644
index 00000000..0141ce23
--- /dev/null
+++ b/source_collectors/auto_googler/AutoGoogler.py
@@ -0,0 +1,28 @@
+from source_collectors.auto_googler.GoogleSearcher import GoogleSearcher
+from source_collectors.auto_googler.SearchConfig import SearchConfig
+
+
+class AutoGoogler:
+    """
+    The AutoGoogler orchestrates the process of fetching URLs from Google Search
+    and processing them for source collection.
+
+    """
+    def __init__(self, search_config: SearchConfig, google_searcher: GoogleSearcher):
+        self.search_config = search_config
+        self.google_searcher = google_searcher
+        self.data = {query: [] for query in search_config.queries}
+
+    def run(self):
+        """
+        Runs the AutoGoogler.
+        Yields status messages as each query is searched.
+        """
+        for query in self.search_config.queries:
+            yield f"Searching for '{query}' ..."
+            results = self.google_searcher.search(query)
+            if results is not None:
+                self.data[query] = results
+            yield f"Found {len(self.data[query])} results for '{query}'."
+        yield "Done."
+
diff --git a/source_collectors/auto_googler/GoogleSearcher.py b/source_collectors/auto_googler/GoogleSearcher.py
new file mode 100644
index 00000000..6638770a
--- /dev/null
+++ b/source_collectors/auto_googler/GoogleSearcher.py
@@ -0,0 +1,65 @@
+from typing import Union
+
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+
+class QuotaExceededError(Exception):
+    pass
+
+class GoogleSearcher:
+    """
+    A class that provides a GoogleSearcher object for performing searches using the Google Custom Search API.
+
+    Attributes:
+        api_key (str): The API key required for accessing the Google Custom Search API.
+        cse_id (str): The CSE (Custom Search Engine) ID required for identifying the specific search engine to use.
+        service (Google API service): The Google API service object for performing the search.
+
+    Methods:
+        __init__(api_key: str, cse_id: str)
+            Initializes a GoogleSearcher object with the provided API key and CSE ID. Raises a RuntimeError if either
+            the API key or CSE ID is None.
+
+        search(query: str) -> Union[list[dict], None]
+            Performs a search using the Google Custom Search API with the provided query string. Returns a list of
+            search results as dictionaries, or None if the query returned no results. Raises a QuotaExceededError if
+            the daily API quota has been exceeded, and a RuntimeError if any other error occurs during the search.
+ """ + GOOGLE_SERVICE_NAME = "customsearch" + GOOGLE_SERVICE_VERSION = "v1" + + def __init__( + self, + api_key: str, + cse_id: str + ): + if api_key is None or cse_id is None: + raise RuntimeError("Custom search API key and CSE ID cannot be None.") + self.api_key = api_key + self.cse_id = cse_id + + self.service = build(self.GOOGLE_SERVICE_NAME, + self.GOOGLE_SERVICE_VERSION, + developerKey=self.api_key) + + def search(self, query: str) -> Union[list[dict], None]: + """ + Searches for results using the specified query. + + Args: + query (str): The query to search for. + + Returns: Union[list[dict], None]: A list of dictionaries representing the search results. + If the daily quota is exceeded, None is returned. + """ + try: + res = self.service.cse().list(q=query, cx=self.cse_id).execute() + if "items" not in res: + return None + return res['items'] + # Process your results + except HttpError as e: + if "Quota exceeded" in str(e): + raise QuotaExceededError("Quota exceeded for the day") + else: + raise RuntimeError(f"An error occurred: {str(e)}") \ No newline at end of file diff --git a/source_collectors/auto_googler/README.md b/source_collectors/auto_googler/README.md new file mode 100644 index 00000000..744ac52b --- /dev/null +++ b/source_collectors/auto_googler/README.md @@ -0,0 +1,6 @@ + + +Auto-Googler logic consists of: + +1. `GoogleSearcher`, the class that interfaces with the Google Search API. +2. `AutoGoogler`, the class that orchestrates the search process. \ No newline at end of file diff --git a/source_collectors/auto_googler/SearchConfig.py b/source_collectors/auto_googler/SearchConfig.py new file mode 100644 index 00000000..b2daf8fd --- /dev/null +++ b/source_collectors/auto_googler/SearchConfig.py @@ -0,0 +1,24 @@ +from typing import Annotated + +from pydantic import BaseModel, Field + + +class SearchConfig(BaseModel): + """ + A class that holds the configuration for the AutoGoogler + Simple now, but might be extended in the future + """ + urls_per_result: Annotated[ + int, + "Maximum number of URLs returned per result. Minimum is 1. Default is 10" + ] = Field( + default=10, + ge=1 + ) + queries: Annotated[ + list[str], + "List of queries to search for." + ] = Field( + min_length=1 + ) + diff --git a/source_collectors/auto_googler/__init__.py b/source_collectors/auto_googler/__init__.py new file mode 100644 index 00000000..e69de29b From cc70461d14b4c73c30d8fec4c86e81a5ea583db7 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 22 Dec 2024 15:33:34 -0500 Subject: [PATCH 2/3] Create and update README documents. --- source_collectors/README.md | 11 +++++++++++ source_collectors/auto_googler/README.md | 11 +++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 source_collectors/README.md diff --git a/source_collectors/README.md b/source_collectors/README.md new file mode 100644 index 00000000..6e5bf346 --- /dev/null +++ b/source_collectors/README.md @@ -0,0 +1,11 @@ +# Source Collectors + +The Source Collectors are tools that collect URLs from various sources for the purposes of storing. + +Each tool is intended to be used in conjunction with the other tools in this repo. + +Currently, the following tools are available: +- Common Crawler: A tool for crawling the Common Crawl dataset. +- Auto-Googler: A tool for automating the process of fetching URLs from Google Search and processing them for source collection. 
+- CKAN Scraper: A tool for retrieving packages from CKAN data portals and parsing relevant information
+- MuckRock Scraper: A tool for retrieving FOIA requests from MuckRock and parsing relevant information
\ No newline at end of file
diff --git a/source_collectors/auto_googler/README.md b/source_collectors/auto_googler/README.md
index 744ac52b..99cc96dc 100644
--- a/source_collectors/auto_googler/README.md
+++ b/source_collectors/auto_googler/README.md
@@ -1,6 +1,13 @@
-
+The Auto-Googler is a tool that automates the process of fetching URLs from Google Search and processing them for source collection.
 
 Auto-Googler logic consists of:
 
 1. `GoogleSearcher`, the class that interfaces with the Google Search API.
-2. `AutoGoogler`, the class that orchestrates the search process.
\ No newline at end of file
+2. `AutoGoogler`, the class that orchestrates the search process.
+3. `SearchConfig`, the class that holds the configuration for the search queries.
+
+The following environment variables must be set in an `.env` file in the root directory:
+
+- `GOOGLE_API_KEY`: The API key required for accessing the Google Custom Search API.
+
+The Auto-Googler is intended to be used in conjunction with the other tools in this repo.

From 98cb9a55dfa444c09726b9cdb114d2d1d77eb27b Mon Sep 17 00:00:00 2001
From: Max Chis
Date: Sun, 22 Dec 2024 15:51:35 -0500
Subject: [PATCH 3/3] Add mention of `source_collectors` directory in repo-level readme, move common_crawler description to `source_collectors` README.md

---
 README.md                   | 2 +-
 source_collectors/README.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 33440d64..0b9d167c 100644
--- a/README.md
+++ b/README.md
@@ -7,11 +7,11 @@ name | description of purpose
 .github/workflows | Scheduling and automation
 agency_identifier | Matches URLs with an agency from the PDAP database
 annotation_pipeline | Automated pipeline for generating training data in our ML data source identification models. Manages common crawl, HTML tag collection, and Label Studio import/export
-common_crawler | Interfaces with the Common Crawl dataset to extract urls, creating batches to identify or annotate
 html_tag_collector | Collects HTML header, meta, and title tags and appends them to a JSON file. The idea is to make a richer dataset for algorithm training and data labeling.
 hugging_face | Utilities for interacting with our machine learning space at [Hugging Face](https://huggingface.co/PDAP)
 identification_pipeline.py | The core python script uniting this modular pipeline. More details below.
 openai-playground | Scripts for accessing the openai API on PDAP's shared account
+source_collectors | Tools for extracting metadata from different sources, including CKAN data portals and Common Crawler
 
 ## How to use
 
diff --git a/source_collectors/README.md b/source_collectors/README.md
index 6e5bf346..1b02215c 100644
--- a/source_collectors/README.md
+++ b/source_collectors/README.md
@@ -1,11 +1,11 @@
 # Source Collectors
 
-The Source Collectors are tools that collect URLs from various sources for the purpose of storing them.
+The Source Collectors are tools that collect URLs from various sources for the purpose of storing them as batches.
 
 Each tool is intended to be used in conjunction with the other tools in this repo.
 
 Currently, the following tools are available:
-- Common Crawler: A tool for crawling the Common Crawl dataset.
+- Common Crawler: Interfaces with the Common Crawl dataset to extract URLs
 - Auto-Googler: A tool for automating the process of fetching URLs from Google Search and processing them for source collection.
 - CKAN Scraper: A tool for retrieving packages from CKAN data portals and parsing relevant information
 - MuckRock Scraper: A tool for retrieving FOIA requests from MuckRock and parsing relevant information
\ No newline at end of file
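
Reviewer note, not part of the patch series: a minimal usage sketch of how the classes introduced here might be wired together. `GOOGLE_API_KEY` is the variable named in the README; `GOOGLE_CSE_ID` is a hypothetical name for the CSE ID variable, which the patches do not specify, and the queries below are placeholders.

import os

from source_collectors.auto_googler.AutoGoogler import AutoGoogler
from source_collectors.auto_googler.GoogleSearcher import GoogleSearcher
from source_collectors.auto_googler.SearchConfig import SearchConfig

# Assumed environment variables; GOOGLE_CSE_ID is a placeholder name.
searcher = GoogleSearcher(
    api_key=os.environ["GOOGLE_API_KEY"],
    cse_id=os.environ["GOOGLE_CSE_ID"],
)

# queries is required and must contain at least one entry; urls_per_result defaults to 10.
config = SearchConfig(queries=["example query one", "example query two"])

auto_googler = AutoGoogler(search_config=config, google_searcher=searcher)

# run() is a generator of status messages; results accumulate in auto_googler.data.
for status in auto_googler.run():
    print(status)

for query, results in auto_googler.data.items():
    # Each result is a Custom Search API item dict; the URL is under the "link" key.
    print(query, [result.get("link") for result in results])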