Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions bases/ecoindex/backend/routers/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import requests
from celery.result import AsyncResult
import ua_generator
from ecoindex.backend.dependencies.validation import validate_api_key_batch
from ecoindex.backend.models.dependencies_parameters.id import IdParameter
from ecoindex.backend.utils import check_quota
Expand All @@ -17,6 +16,7 @@
example_host_unreachable,
)
from ecoindex.models.tasks import QueueTaskApi, QueueTaskApiBatch, QueueTaskResult
from ecoindex.scraper.scrap import EcoindexScraper
from ecoindex.worker.tasks import ecoindex_batch_import_task, ecoindex_task
from ecoindex.worker_component import app as task_app
from fastapi import APIRouter, Depends, HTTPException, Response, status
Expand Down Expand Up @@ -49,6 +49,13 @@ async def add_ecoindex_analysis_task(
example=WebPage(url="https://www.ecoindex.fr", width=1920, height=1080),
),
],
custom_headers: Annotated[
dict[str, str],
Body(
description="Custom headers to add to the request",
example={"X-My-Custom-Header": "MyValue"},
),
] = {},
session: AsyncSession = Depends(get_session),
) -> str:
if Settings().DAILY_LIMIT_PER_HOST:
Expand All @@ -68,20 +75,29 @@ async def add_ecoindex_analysis_task(
detail="This host is excluded from the analysis",
)

ua = EcoindexScraper.get_user_agent()
headers = {**custom_headers, **ua.headers.get()}

try:
ua = ua_generator.generate()
r = requests.head(url=web_page.url, timeout=5, headers=ua.headers.get())
r = requests.head(
url=web_page.url,
timeout=5,
headers=headers,
)
r.raise_for_status()
except requests.exceptions.RequestException as e:
raise HTTPException(
status_code=e.response.status_code
if e.response
else status.HTTP_400_BAD_REQUEST,
detail=f"The URL {web_page.url} is unreachable. Are you really sure of this url? 🤔",
detail=f"The URL {web_page.url} is unreachable. Are you really sure of this url? 🤔 ({e.response.status_code if e.response else ''})",
)

task_result = ecoindex_task.delay( # type: ignore
url=str(web_page.url), width=web_page.width, height=web_page.height
url=str(web_page.url),
width=web_page.width,
height=web_page.height,
custom_headers=headers,
)

return task_result.id
Expand Down
10 changes: 8 additions & 2 deletions bases/ecoindex/worker/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@
queue="ecoindex",
dont_autoretry_for=[EcoindexScraperStatusException, TypeError],
)
def ecoindex_task(self, url: str, width: int, height: int) -> str:
def ecoindex_task(
self, url: str, width: int, height: int, custom_headers: dict[str, str]
) -> str:
queue_task_result = run(
async_ecoindex_task(self, url=url, width=width, height=height)
async_ecoindex_task(
self, url=url, width=width, height=height, custom_headers=custom_headers
)
)

return queue_task_result.model_dump_json()
Expand All @@ -50,6 +54,7 @@ async def async_ecoindex_task(
url: str,
width: int,
height: int,
custom_headers: dict[str, str],
) -> QueueTaskResult:
try:
session_generator = get_session()
Expand All @@ -69,6 +74,7 @@ async def async_ecoindex_task(
else None,
screenshot_gid=Settings().SCREENSHOTS_GID,
screenshot_uid=Settings().SCREENSHOTS_UID,
custom_headers=custom_headers,
).get_page_analysis()

db_result = await save_ecoindex_result_db(
Expand Down
14 changes: 14 additions & 0 deletions components/ecoindex/scraper/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from time import sleep
from uuid import uuid4

from ua_generator.user_agent import UserAgent
from ua_generator import generate as ua_generate

from ecoindex.compute import compute_ecoindex
from ecoindex.exceptions.scraper import EcoindexScraperStatusException
from ecoindex.models.compute import PageMetrics, Result, ScreenShot, WindowSize
Expand All @@ -28,6 +31,7 @@ def __init__(
headless: bool = True,
basic_auth: str | None = None,
cookies: list[SetCookieParam] = [],
custom_headers: dict[str, str] = {},
):
self.url = url
self.window_size = window_size
Expand All @@ -45,6 +49,15 @@ def __init__(
self.headless = headless
self.basic_auth = basic_auth
self.cookies = cookies
self.custom_headers = custom_headers

@staticmethod
def get_user_agent() -> UserAgent:
    """Return a user agent generated for desktop Chrome on Linux.

    The generation criteria are fixed here so that every caller of this
    helper requests the same device/browser/platform combination from
    ``ua_generator``.

    Returns:
        The ``UserAgent`` object produced by ``ua_generator.generate``.
    """
    # Criteria forwarded verbatim as keyword arguments to the generator.
    criteria = {
        "device": "desktop",
        "browser": "chrome",
        "platform": "linux",
    }
    return ua_generate(**criteria)

@deprecated("This method is useless with new version of EcoindexScraper")
def init_chromedriver(self):
Expand Down Expand Up @@ -86,6 +99,7 @@ async def scrap_page(self) -> PageMetrics:
}
if self.basic_auth
else None,
extra_http_headers=self.custom_headers,
)
await self.context.add_cookies(self.cookies)
self.page = await self.context.new_page()
Expand Down
49 changes: 23 additions & 26 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion projects/ecoindex_api/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion projects/ecoindex_api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ sqlmodel = "^0.0.14"
sentry-sdk = "^2.8.0"
setuptools = "^75.6.0"
cryptography = "^44.0.2"
ua-generator = "^2.0.5"

[tool.poetry.group.backend.dependencies]
uvicorn = "^0.23.2"
ua-generator = "^2.0.3"

[tool.poetry.group.worker.dependencies]
pillow = "^10.3.0"
Expand Down
13 changes: 12 additions & 1 deletion projects/ecoindex_cli/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions projects/ecoindex_cli/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pyyaml = "^6.0.1"
rich = "^13.6.0"
scrapy = "^2.11.0"
typer = "^0.9.0"
ua-generator = "^2.0.5"

[tool.poetry.scripts]
ecoindex-cli = "ecoindex.cli.app:app"
Expand Down
13 changes: 12 additions & 1 deletion projects/ecoindex_scraper/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions projects/ecoindex_scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ typing-extensions = "^4.8.0"
pyyaml = "^6.0.1"
pillow = "^10.1.0"
setuptools = ">=69.5.1,<71.0.0"
ua-generator = "^2.0.5"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
1 change: 0 additions & 1 deletion tasks/PoetryTaskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,4 @@ tasks:
cmds:
- poetry run playwright install chromium --with-deps
silent: true
internal: true
interactive: true