Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2
from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2
from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2
from vulnerabilities.pipelines.v2_importers import libreoffice_importer as libreoffice_importer_v2
from vulnerabilities.pipelines.v2_importers import mattermost_importer as mattermost_importer_v2
from vulnerabilities.pipelines.v2_importers import mozilla_importer as mozilla_importer_v2
from vulnerabilities.pipelines.v2_importers import nginx_importer as nginx_importer_v2
Expand Down Expand Up @@ -118,6 +119,7 @@
retiredotnet_importer_v2.RetireDotnetImporterPipeline,
ubuntu_osv_importer_v2.UbuntuOSVImporterPipeline,
alpine_linux_importer_v2.AlpineLinuxImporterPipeline,
libreoffice_importer_v2.LibreOfficeImporterPipeline,
nvd_importer.NVDImporterPipeline,
github_importer.GitHubAPIImporterPipeline,
gitlab_importer.GitLabImporterPipeline,
Expand Down
132 changes: 132 additions & 0 deletions vulnerabilities/pipelines/v2_importers/libreoffice_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
import re
from typing import Iterable

import dateparser
import requests
from bs4 import BeautifulSoup

from vulnerabilities.importer import AdvisoryDataV2
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2

# Module-level logger; used below to report advisory pages that fail to download
# and announcement dates that cannot be parsed.
logger = logging.getLogger(__name__)

# Listing page that is fetched to discover the individual advisory page URLs.
ADVISORIES_URL = "https://www.libreoffice.org/about-us/security/advisories/"


class LibreOfficeImporterPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Collect LibreOffice security advisories from libreoffice.org.

    The advisory listing page is downloaded first to discover the per-advisory
    page URLs; each of those pages is then downloaded and parsed on its own.
    """

    pipeline_id = "libreoffice_importer"
    spdx_license_expression = "LicenseRef-scancode-proprietary-license"
    license_url = "https://www.libreoffice.org/about-us/security/"
    precedence = 200

    @classmethod
    def steps(cls):
        """Return the pipeline steps: fetch the listing, then collect and store advisories."""
        return cls.fetch, cls.collect_and_store_advisories

    def fetch(self):
        """Download the listing page and keep the advisory URLs it links to."""
        self.log(f"Fetch `{ADVISORIES_URL}`")
        listing = requests.get(ADVISORIES_URL, timeout=30)
        listing.raise_for_status()
        self.advisory_urls = parse_advisory_urls(listing.text)

    def advisories_count(self):
        """Return how many advisory URLs are stored in ``self.advisory_urls``."""
        return len(self.advisory_urls)

    def collect_advisories(self) -> Iterable[AdvisoryDataV2]:
        """Yield the parsed advisory for every stored URL that can be fetched and parsed."""
        for advisory_url in self.advisory_urls:
            try:
                page = requests.get(advisory_url, timeout=30)
                page.raise_for_status()
            except Exception as error:
                # A page that cannot be downloaded is logged and skipped so that
                # the remaining advisories are still processed.
                logger.error("Failed to fetch %s: %s", advisory_url, error)
                continue

            parsed = parse_advisory(page.text, advisory_url)
            if not parsed:
                continue
            yield parsed


def parse_advisory_urls(html: str) -> list:
    """
    Return the absolute advisory page URLs linked from the listing page ``html``.

    Each advisory appears once, in order of first occurrence. Only lowercase
    ``cve-<digits and dashes>`` slugs are recognised.
    """
    urls = []
    known_slugs = set()
    for link in re.finditer(r"/about-us/security/advisories/(cve-[\d-]+)/", html):
        slug = link.group(1)
        if slug in known_slugs:
            continue
        known_slugs.add(slug)
        urls.append(f"https://www.libreoffice.org/about-us/security/advisories/{slug}/")
    return urls


def parse_advisory(html: str, url: str):
    """
    Build an ``AdvisoryDataV2`` from the HTML of one LibreOffice advisory page.

    Return None when the ``<body>`` id does not start with ``cve-`` or when the
    ``section#content1 div.margin-20`` content container is not found.
    """
    soup = BeautifulSoup(html, features="lxml")

    # The advisory id is read from the id attribute of <body> and upper-cased.
    body_tag = soup.find("body")
    page_id = body_tag.get("id", "") if body_tag else ""
    if not page_id.startswith("cve-"):
        return None
    advisory_id = page_id.upper()

    content = soup.select_one("section#content1 div.margin-20")
    if not content:
        return None

    # Labelled fields are extracted from the flattened text, one node per line.
    text = content.get_text(separator="\n")
    title = _get_field(text, "Title")
    announced = _get_field(text, "Announced")

    date_published = (
        dateparser.parse(
            announced,
            settings={"TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"},
        )
        if announced
        else None
    )
    if announced and date_published is None:
        logger.warning("Could not parse date %r for %s", announced, advisory_id)

    # The description runs from its label up to the "Credits" or "References"
    # label (or the end of the text); whitespace runs are collapsed to one space.
    description = ""
    description_match = re.search(
        r"Description\s*\n?\s*:\s*\n+(.*?)(?=\nCredits\b|\nReferences\b|$)",
        text,
        re.DOTALL,
    )
    if description_match:
        description = " ".join(description_match.group(1).split())

    # Only links that come after the "References" label are kept as references.
    references = []
    past_references_label = False
    for node in content.descendants:
        node_name = getattr(node, "name", None)
        if node_name == "strong" and "References" in node.get_text():
            past_references_label = True
        if not (past_references_label and node_name == "a"):
            continue
        href = node.get("href", "")
        if href.startswith("http"):
            references.append(ReferenceV2(url=href))

    return AdvisoryDataV2(
        advisory_id=advisory_id,
        aliases=[],
        summary=description or title,
        affected_packages=[],
        references=references,
        date_published=date_published,
        weaknesses=[],
        severities=[],
        url=url,
        original_advisory_text=str(content),
    )


def _get_field(text: str, label: str) -> str:
m = re.search(rf"{re.escape(label)}\s*:\s*\n?\s*([^\n]+)", text)
return m.group(1).strip() if m else ""
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from unittest import TestCase
from unittest.mock import MagicMock
from unittest.mock import patch

from vulnerabilities.pipelines.v2_importers.libreoffice_importer import LibreOfficeImporterPipeline
from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_advisory
from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_advisory_urls

# Minimal stand-in for the advisory listing page: three links to advisory pages.
LISTING_HTML = """
<a href="/about-us/security/advisories/cve-2025-1080/">CVE-2025-1080</a>
<a href="/about-us/security/advisories/cve-2023-2255/">CVE-2023-2255</a>
<a href="/about-us/security/advisories/cve-2023-4863/">CVE-2023-4863</a>
"""

# Minimal stand-in for a single advisory page: the advisory id is carried by the
# <body> id, and the labelled fields live inside section#content1 div.margin-20.
ADVISORY_HTML = """\
<html><body id="cve-2025-1080">
<section id="content1">
<div class="row col-sm-10 margin-20">
<p><strong><span class="label">Title:</span></strong> Macro URL arbitrary script execution</p>
<p><strong>Announced:</strong> March 4, 2025<br/><strong>Description</strong>:</p>
<p>LibreOffice supports Office URI Schemes to enable browser integration.</p>
<p><strong>Credits</strong>:</p>
<p><strong>References</strong>:<br/><a href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2025-1080">CVE-2025-1080</a></p>
</div>
</section>
</body></html>
"""


class TestParseAdvisoryUrls(TestCase):
    """Check ``parse_advisory_urls`` against small listing-page snippets."""

    def test_extracts_urls_from_html(self):
        # Every advisory linked from the listing must come back as an absolute URL.
        found = parse_advisory_urls(LISTING_HTML)
        expected_urls = (
            "https://www.libreoffice.org/about-us/security/advisories/cve-2025-1080/",
            "https://www.libreoffice.org/about-us/security/advisories/cve-2023-2255/",
            "https://www.libreoffice.org/about-us/security/advisories/cve-2023-4863/",
        )
        for expected in expected_urls:
            self.assertIn(expected, found)

    def test_deduplicates_repeated_urls(self):
        # The same link repeated twice must yield a single URL.
        anchor = '<a href="/about-us/security/advisories/cve-2025-1080/">x</a>'
        found = parse_advisory_urls(anchor * 2)
        self.assertEqual(len(found), 1)

    def test_empty_html_returns_empty_list(self):
        found = parse_advisory_urls("<html></html>")
        self.assertEqual(found, [])


class TestParseAdvisory(TestCase):
    """Check ``parse_advisory`` against the trimmed ``ADVISORY_HTML`` fixture."""

    URL = "https://www.libreoffice.org/about-us/security/advisories/cve-2025-1080/"

    def _parsed_fixture(self):
        """Return the result of parsing ``ADVISORY_HTML`` for ``self.URL``."""
        return parse_advisory(ADVISORY_HTML, self.URL)

    def test_parses_advisory_id(self):
        parsed = self._parsed_fixture()
        self.assertIsNotNone(parsed)
        self.assertEqual(parsed.advisory_id, "CVE-2025-1080")

    def test_parses_description_as_summary(self):
        summary = self._parsed_fixture().summary
        self.assertIn("Office URI Schemes", summary)

    def test_parses_date(self):
        published = self._parsed_fixture().date_published
        self.assertIsNotNone(published)
        self.assertEqual(published.year, 2025)
        self.assertEqual(published.month, 3)
        self.assertEqual(published.day, 4)

    def test_extracts_reference_url(self):
        reference_urls = [reference.url for reference in self._parsed_fixture().references]
        self.assertIn(
            "https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2025-1080", reference_urls
        )

    def test_severities_and_weaknesses_are_empty(self):
        parsed = self._parsed_fixture()
        self.assertEqual(parsed.severities, [])
        self.assertEqual(parsed.weaknesses, [])

    def test_missing_body_id_returns_none(self):
        # A <body> id that does not start with "cve-" means there is no advisory id.
        page = (
            "<html><body id='not-a-cve'>"
            "<section id='content1'><div class='margin-20'></div></section>"
            "</body></html>"
        )
        result = parse_advisory(page, self.URL)
        self.assertIsNone(result)

    def test_missing_content_div_returns_none(self):
        page = "<html><body id='cve-2025-1080'><section id='other'></section></body></html>"
        result = parse_advisory(page, self.URL)
        self.assertIsNone(result)

    def test_original_advisory_text_contains_advisory_id(self):
        original_text = self._parsed_fixture().original_advisory_text
        self.assertIn("CVE-2025-1080", original_text)


class TestLibreOfficeImporterPipeline(TestCase):
    """Check the pipeline steps with ``requests.get`` replaced by a mock."""

    @staticmethod
    def _stub_response(text):
        """Return a mock response whose body is ``text`` and whose status check passes."""
        stub = MagicMock()
        stub.text = text
        stub.raise_for_status.return_value = None
        return stub

    @patch("vulnerabilities.pipelines.v2_importers.libreoffice_importer.requests.get")
    def test_fetch_stores_advisory_urls(self, mock_get):
        mock_get.return_value = self._stub_response(LISTING_HTML)
        pipeline = LibreOfficeImporterPipeline()
        pipeline.fetch()
        for slug in ("cve-2025-1080", "cve-2023-2255"):
            self.assertTrue(any(slug in found for found in pipeline.advisory_urls))

    @patch("vulnerabilities.pipelines.v2_importers.libreoffice_importer.requests.get")
    def test_collect_advisories_yields_advisory(self, mock_get):
        mock_get.return_value = self._stub_response(ADVISORY_HTML)
        pipeline = LibreOfficeImporterPipeline()
        pipeline.advisory_urls = [
            "https://www.libreoffice.org/about-us/security/advisories/cve-2025-1080/"
        ]
        collected = list(pipeline.collect_advisories())
        self.assertEqual(len(collected), 1)
        self.assertEqual(collected[0].advisory_id, "CVE-2025-1080")

    @patch("vulnerabilities.pipelines.v2_importers.libreoffice_importer.requests.get")
    def test_collect_advisories_skips_on_http_error(self, mock_get):
        # A failing download must be logged at ERROR level and produce no advisory.
        mock_get.side_effect = Exception("timeout")
        pipeline = LibreOfficeImporterPipeline()
        pipeline.advisory_urls = [
            "https://www.libreoffice.org/about-us/security/advisories/cve-2025-1080/"
        ]
        logger_name = "vulnerabilities.pipelines.v2_importers.libreoffice_importer"
        with self.assertLogs(logger_name, level="ERROR") as captured:
            collected = list(pipeline.collect_advisories())
        self.assertEqual(collected, [])
        self.assertTrue(any("cve-2025-1080" in line for line in captured.output))

    def test_advisories_count(self):
        pipeline = LibreOfficeImporterPipeline()
        pipeline.advisory_urls = [
            "https://www.libreoffice.org/about-us/security/advisories/cve-2025-1080/",
            "https://www.libreoffice.org/about-us/security/advisories/cve-2023-2255/",
        ]
        count = pipeline.advisories_count()
        self.assertEqual(count, 2)
Loading