diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 594021092..83a51f2b2 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -59,6 +59,7 @@ from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 +from vulnerabilities.pipelines.v2_importers import libreoffice_importer as libreoffice_importer_v2 from vulnerabilities.pipelines.v2_importers import mattermost_importer as mattermost_importer_v2 from vulnerabilities.pipelines.v2_importers import mozilla_importer as mozilla_importer_v2 from vulnerabilities.pipelines.v2_importers import nginx_importer as nginx_importer_v2 @@ -118,6 +119,7 @@ retiredotnet_importer_v2.RetireDotnetImporterPipeline, ubuntu_osv_importer_v2.UbuntuOSVImporterPipeline, alpine_linux_importer_v2.AlpineLinuxImporterPipeline, + libreoffice_importer_v2.LibreOfficeImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/libreoffice_importer.py b/vulnerabilities/pipelines/v2_importers/libreoffice_importer.py new file mode 100644 index 000000000..7a4f8d614 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/libreoffice_importer.py @@ -0,0 +1,132 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
#

import logging
import re
from typing import Iterable
from typing import List
from typing import Optional

import dateparser
import requests
from bs4 import BeautifulSoup

from vulnerabilities.importer import AdvisoryDataV2
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2

logger = logging.getLogger(__name__)

# Listing page; individual advisory pages live directly below it as "<slug>/".
ADVISORIES_URL = "https://www.libreoffice.org/about-us/security/advisories/"


class LibreOfficeImporterPipeline(VulnerableCodeBaseImporterPipelineV2):
    """Collect LibreOffice security advisories from libreoffice.org."""

    pipeline_id = "libreoffice_importer"
    spdx_license_expression = "LicenseRef-scancode-proprietary-license"
    license_url = "https://www.libreoffice.org/about-us/security/"
    precedence = 200

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        # ``collect_and_store_advisories`` is not defined in this module;
        # presumably it is inherited from the base pipeline class.
        return (
            cls.fetch,
            cls.collect_and_store_advisories,
        )

    def fetch(self):
        """Download the listing page and keep the advisory URLs on ``self.advisory_urls``.

        Raises whatever ``requests`` raises on a network failure or a non-2xx
        response (via ``raise_for_status``).
        """
        self.log(f"Fetch `{ADVISORIES_URL}`")
        resp = requests.get(ADVISORIES_URL, timeout=30)
        resp.raise_for_status()
        self.advisory_urls = parse_advisory_urls(resp.text)

    def advisories_count(self):
        """Return the number of advisory URLs found by ``fetch``."""
        return len(self.advisory_urls)

    def collect_advisories(self) -> Iterable[AdvisoryDataV2]:
        """Yield one AdvisoryDataV2 per advisory page that can be fetched and parsed."""
        for url in self.advisory_urls:
            try:
                resp = requests.get(url, timeout=30)
                resp.raise_for_status()
            except Exception as e:
                # Best effort: a single unreachable page is logged and skipped
                # so that it does not abort the remaining advisories.
                logger.error("Failed to fetch %s: %s", url, e)
                continue
            advisory = parse_advisory(resp.text, url)
            if advisory:
                yield advisory


def parse_advisory_urls(html: str) -> List[str]:
    """Return deduplicated advisory page URLs from the listing page.

    Slugs of the form ``cve-<digits and dashes>`` are collected from ``html`` and
    appended to ``ADVISORIES_URL``. The order of first appearance is preserved.
    """
    slugs = re.findall(r"/about-us/security/advisories/(cve-[\d-]+)/", html)
    # dict.fromkeys() drops repeated slugs while keeping insertion order.
    return [f"{ADVISORIES_URL}{slug}/" for slug in dict.fromkeys(slugs)]


def parse_advisory(html: str, url: str) -> Optional[AdvisoryDataV2]:
    """Parse a single LibreOffice advisory page into an AdvisoryDataV2.

    ``html`` is the page markup and ``url`` is stored unchanged on the result.

    Return None when the ``<body>`` element has no ``id`` starting with "cve-"
    (the id, upper-cased, is the advisory id) or when the content container
    ``section#content1 div.margin-20`` is not found.
    """
    soup = BeautifulSoup(html, features="lxml")
    body = soup.find("body")
    body_id = body.get("id", "") if body else ""
    if not body_id.startswith("cve-"):
        return None
    advisory_id = body_id.upper()

    content = soup.select_one("section#content1 div.margin-20")
    if not content:
        return None

    # One text node per line, so that labels and their values can be matched
    # with line-oriented regular expressions below.
    text = content.get_text(separator="\n")

    title = _get_field(text, "Title")
    date_str = _get_field(text, "Announced")

    date_published = None
    if date_str:
        date_published = dateparser.parse(
            date_str,
            settings={"TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"},
        )
        if date_published is None:
            logger.warning("Could not parse date %r for %s", date_str, advisory_id)

    # The description runs from its label up to the "Credits" or "References"
    # label, or to the end of the text when neither follows.
    desc_m = re.search(
        r"Description\s*\n?\s*:\s*\n+(.*?)(?=\nCredits\b|\nReferences\b|$)",
        text,
        re.DOTALL,
    )
    # split()/join collapses every whitespace run (including newlines) to one space.
    description = " ".join(desc_m.group(1).split()) if desc_m else ""

    return AdvisoryDataV2(
        advisory_id=advisory_id,
        aliases=[],
        summary=description or title,
        affected_packages=[],
        references=_extract_references(content),
        date_published=date_published,
        weaknesses=[],
        severities=[],
        url=url,
        original_advisory_text=str(content),
    )


def _extract_references(content) -> List[ReferenceV2]:
    """Return a ReferenceV2 for each distinct http(s) link following the "References" label.

    ``content`` is the parsed advisory container. Links are collected in document
    order, starting after the first ``<strong>`` element whose text contains
    "References"; a URL that occurs more than once is reported only once.
    """
    references = []
    seen_urls = set()
    in_refs = False
    for node in content.descendants:
        # Text nodes have no tag name; only elements are of interest here.
        node_name = getattr(node, "name", None)
        if node_name == "strong" and "References" in node.get_text():
            in_refs = True
        if in_refs and node_name == "a":
            href = node.get("href", "")
            if href.startswith("http") and href not in seen_urls:
                seen_urls.add(href)
                references.append(ReferenceV2(url=href))
    return references


def _get_field(text: str, label: str) -> str:
    """Return the stripped value that follows ``label:`` in ``text``, or "" if absent.

    The value may be on the same line as the label or on a following line.
    NOTE: ``\\s*`` also matches newlines, so a label with an empty value captures
    the next non-blank line instead.
    """
    m = re.search(rf"{re.escape(label)}\s*:\s*\n?\s*([^\n]+)", text)
    return m.group(1).strip() if m else ""


# Patch boundary kept from the original text: header of the next file in this diff.
# diff --git a/vulnerabilities/tests/pipelines/v2_importers/test_libreoffice_importer.py b/vulnerabilities/tests/pipelines/v2_importers/test_libreoffice_importer.py
# new file mode 100644
# index 000000000..070381287
# --- /dev/null
# +++ b/vulnerabilities/tests/pipelines/v2_importers/test_libreoffice_importer.py
# @@ -0,0 +1,152 @@
# +#
# +# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from unittest import TestCase +from unittest.mock import MagicMock +from unittest.mock import patch + +from vulnerabilities.pipelines.v2_importers.libreoffice_importer import LibreOfficeImporterPipeline +from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_advisory +from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_advisory_urls + +LISTING_HTML = """ +CVE-2025-1080 +CVE-2023-2255 +CVE-2023-4863 +""" + +ADVISORY_HTML = """\ +
+Title: Macro URL arbitrary script execution
+Announced: March 4, 2025
Description:
LibreOffice supports Office URI Schemes to enable browser integration.
+Credits:
+References:
CVE-2025-1080