From 9fd6c90cdda58faf52a91d7a8ae5bfd2d45fa7d8 Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Tue, 16 Dec 2025 18:41:18 +0100 Subject: [PATCH 1/4] feat(namespace): add GeoRSS, Creative Commons, and ParseOptions API Implements Phase 1 of feedparser parity gap analysis: - Add GeoRSS namespace support for geographic location data - Supports point, line, polygon, and box elements - Coordinate validation and parsing - Add Creative Commons namespace for license metadata - Supports cc:license with rdf:resource attribute - Supports legacy creativeCommons:license text element - Converts licenses to links with rel="license" - Add ParseOptions API for parser configuration - resolve_relative_uris: URL resolution control - sanitize_html: HTML sanitization toggle - limits: ParserLimits integration - Presets: default(), strict(), permissive() - Enhanced date parsing - Year-only format ("2024") - Year-month format ("2024-12") --- crates/feedparser-rs-core/src/lib.rs | 2 + crates/feedparser-rs-core/src/namespace/cc.rs | 284 +++++++++++ .../src/namespace/georss.rs | 439 ++++++++++++++++++ .../feedparser-rs-core/src/namespace/mod.rs | 21 + crates/feedparser-rs-core/src/options.rs | 223 +++++++++ crates/feedparser-rs-core/src/types/entry.rs | 2 + crates/feedparser-rs-core/src/util/date.rs | 68 ++- 7 files changed, 1035 insertions(+), 4 deletions(-) create mode 100644 crates/feedparser-rs-core/src/namespace/cc.rs create mode 100644 crates/feedparser-rs-core/src/namespace/georss.rs create mode 100644 crates/feedparser-rs-core/src/options.rs diff --git a/crates/feedparser-rs-core/src/lib.rs b/crates/feedparser-rs-core/src/lib.rs index 9f41224..6fbb8b3 100644 --- a/crates/feedparser-rs-core/src/lib.rs +++ b/crates/feedparser-rs-core/src/lib.rs @@ -46,6 +46,7 @@ pub mod http; mod limits; /// Namespace handlers for extended feed formats pub mod namespace; +mod options; mod parser; /// Type definitions for feed data structures @@ -62,6 +63,7 @@ pub mod util; pub use error::{FeedError, 
Result}; pub use limits::{LimitError, ParserLimits}; +pub use options::ParseOptions; pub use parser::{detect_format, parse, parse_with_limits}; pub use types::{ Content, Enclosure, Entry, FeedMeta, FeedVersion, Generator, Image, ItunesCategory, diff --git a/crates/feedparser-rs-core/src/namespace/cc.rs b/crates/feedparser-rs-core/src/namespace/cc.rs new file mode 100644 index 0000000..33ef1ce --- /dev/null +++ b/crates/feedparser-rs-core/src/namespace/cc.rs @@ -0,0 +1,284 @@ +//! Creative Commons namespace support for license information +//! +//! Handles Creative Commons license metadata in RSS and Atom feeds. +//! Supports both the modern `cc:license` (with `rdf:resource` attribute) +//! and legacy `creativeCommons:license` text elements. +//! +//! # Supported Elements +//! +//! - `cc:license` (with `rdf:resource` attribute) - Modern CC namespace +//! - `creativeCommons:license` (text element) - Legacy Userland namespace +//! +//! # Specification +//! +//! Creative Commons: +//! Legacy: + +use crate::limits::ParserLimits; +use crate::types::generics::LimitedCollectionExt; +use crate::types::{FeedMeta, Link}; +use crate::Entry; + +/// Creative Commons namespace URI (modern) +pub const CC: &str = "http://creativecommons.org/ns#"; + +/// Creative Commons legacy namespace URI (Userland) +pub const CREATIVE_COMMONS: &str = "http://backend.userland.com/creativeCommonsRssModule"; + +/// Handle Creative Commons element at feed level +/// +/// Converts CC license information to a link with `rel="license"` +/// and adds it to the feed's links collection. 
+/// +/// # Arguments +/// +/// * `tag` - Element local name (e.g., "license") +/// * `attrs` - Element attributes as (name, value) pairs +/// * `text` - Element text content +/// * `feed` - Feed metadata to update +/// * `limits` - Parser limits for bounded collections +/// +/// # Returns +/// +/// `true` if element was recognized and handled, `false` otherwise +pub fn handle_feed_element( + tag: &[u8], + attrs: &[(Vec, String)], + text: &str, + feed: &mut FeedMeta, + limits: &ParserLimits, +) -> bool { + match tag { + b"license" => { + if let Some(license_url) = extract_license_url(attrs, text) { + feed.links.try_push_limited( + Link { + href: license_url, + rel: Some("license".to_string()), + ..Default::default() + }, + limits.max_links_per_feed, + ); + } + true + } + _ => false, + } +} + +/// Handle Creative Commons element at entry level +/// +/// Converts CC license information to a link with `rel="license"` +/// and adds it to the entry's links collection. +/// +/// # Arguments +/// +/// * `tag` - Element local name (e.g., "license") +/// * `attrs` - Element attributes as (name, value) pairs +/// * `text` - Element text content +/// * `entry` - Entry to update +/// * `limits` - Parser limits for bounded collections +/// +/// # Returns +/// +/// `true` if element was recognized and handled, `false` otherwise +pub fn handle_entry_element( + tag: &[u8], + attrs: &[(Vec, String)], + text: &str, + entry: &mut Entry, + limits: &ParserLimits, +) -> bool { + match tag { + b"license" => { + if let Some(license_url) = extract_license_url(attrs, text) { + entry.links.try_push_limited( + Link { + href: license_url, + rel: Some("license".to_string()), + ..Default::default() + }, + limits.max_links_per_entry, + ); + } + true + } + _ => false, + } +} + +/// Extract license URL from element +/// +/// Tries two methods in order: +/// 1. `rdf:resource` attribute (modern cc:license format) +/// 2. 
/// Extract license URL from element
///
/// Tries two sources in order:
/// 1. A `resource` attribute, either bare or prefixed (e.g. `rdf:resource`),
///    as used by the modern `cc:license` format
/// 2. Text content (legacy `creativeCommons:license` format)
///
/// Both sources are trimmed, and a source that is blank after trimming is
/// skipped. A whitespace-only attribute therefore no longer produces a blank
/// URL, and no longer hides a usable URL in the element text.
///
/// # Arguments
///
/// * `attrs` - Element attributes as (name, value) pairs
/// * `text` - Element text content
///
/// # Returns
///
/// License URL if found, `None` otherwise
fn extract_license_url(attrs: &[(Vec<u8>, String)], text: &str) -> Option<String> {
    // Modern format: first non-blank `resource` / `*:resource` attribute wins.
    let from_attr = attrs
        .iter()
        .filter(|(name, _)| name == b"resource" || name.ends_with(b":resource"))
        .map(|(_, value)| value.trim())
        .find(|value| !value.is_empty());

    // Legacy format: the URL is the element's text content.
    let from_text = || Some(text.trim()).filter(|candidate| !candidate.is_empty());

    from_attr.or_else(from_text).map(str::to_string)
}
b"rdf:resource".to_vec(), + "http://creativecommons.org/licenses/by/4.0/".to_string(), + )]; + let url = extract_license_url( + &attrs, + "http://creativecommons.org/licenses/by-sa/4.0/", + ) + .unwrap(); + assert_eq!(url, "http://creativecommons.org/licenses/by/4.0/"); + } + + #[test] + fn test_extract_license_url_empty() { + assert!(extract_license_url(&[], "").is_none()); + assert!(extract_license_url(&[], " ").is_none()); + } + + #[test] + fn test_handle_feed_element_license() { + let mut feed = FeedMeta::default(); + let limits = ParserLimits::default(); + + let attrs = vec![( + b"rdf:resource".to_vec(), + "http://creativecommons.org/licenses/by/4.0/".to_string(), + )]; + + let handled = handle_feed_element(b"license", &attrs, "", &mut feed, &limits); + assert!(handled); + assert_eq!(feed.links.len(), 1); + assert_eq!( + feed.links[0].href, + "http://creativecommons.org/licenses/by/4.0/" + ); + assert_eq!(feed.links[0].rel.as_deref(), Some("license")); + } + + #[test] + fn test_handle_entry_element_license() { + let mut entry = Entry::default(); + let limits = ParserLimits::default(); + + let handled = handle_entry_element( + b"license", + &[], + "http://creativecommons.org/licenses/by-sa/4.0/", + &mut entry, + &limits, + ); + assert!(handled); + assert_eq!(entry.links.len(), 1); + assert_eq!( + entry.links[0].href, + "http://creativecommons.org/licenses/by-sa/4.0/" + ); + assert_eq!(entry.links[0].rel.as_deref(), Some("license")); + } + + #[test] + fn test_handle_feed_element_unknown() { + let mut feed = FeedMeta::default(); + let limits = ParserLimits::default(); + + let handled = handle_feed_element(b"unknown", &[], "", &mut feed, &limits); + assert!(!handled); + } + + #[test] + fn test_handle_entry_element_unknown() { + let mut entry = Entry::default(); + let limits = ParserLimits::default(); + + let handled = handle_entry_element(b"unknown", &[], "", &mut entry, &limits); + assert!(!handled); + } + + #[test] + fn test_multiple_licenses() { + let mut feed 
= FeedMeta::default(); + let limits = ParserLimits::default(); + + let attrs1 = vec![( + b"rdf:resource".to_vec(), + "http://creativecommons.org/licenses/by/4.0/".to_string(), + )]; + handle_feed_element(b"license", &attrs1, "", &mut feed, &limits); + + let attrs2 = vec![( + b"rdf:resource".to_vec(), + "http://creativecommons.org/licenses/by-sa/4.0/".to_string(), + )]; + handle_feed_element(b"license", &attrs2, "", &mut feed, &limits); + + assert_eq!(feed.links.len(), 2); + assert_eq!(feed.links[0].rel.as_deref(), Some("license")); + assert_eq!(feed.links[1].rel.as_deref(), Some("license")); + } +} diff --git a/crates/feedparser-rs-core/src/namespace/georss.rs b/crates/feedparser-rs-core/src/namespace/georss.rs new file mode 100644 index 0000000..21a7eca --- /dev/null +++ b/crates/feedparser-rs-core/src/namespace/georss.rs @@ -0,0 +1,439 @@ +//! GeoRSS namespace support for geographic location data +//! +//! Supports parsing GeoRSS Simple elements for specifying geographic locations +//! in RSS and Atom feeds. GeoRSS is commonly used in mapping applications, +//! location-based services, and geocoded content. +//! +//! # Supported Elements +//! +//! - `georss:point` - Single latitude/longitude point +//! - `georss:line` - Line string (multiple points) +//! - `georss:polygon` - Polygon (closed shape) +//! - `georss:box` - Bounding box (lower-left + upper-right) +//! +//! # Specification +//! +//! 
GeoRSS Simple: + +use crate::limits::ParserLimits; +use crate::types::Entry; + +/// `GeoRSS` namespace URI +pub const GEORSS: &str = "http://www.georss.org/georss"; + +/// Type of geographic shape +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum GeoType { + /// Single point (latitude, longitude) + #[default] + Point, + /// Line connecting multiple points + Line, + /// Closed polygon shape + Polygon, + /// Bounding box (lower-left, upper-right corners) + Box, +} + +/// Geographic location data from `GeoRSS` +#[derive(Debug, Clone, Default, PartialEq)] +pub struct GeoLocation { + /// Type of geographic shape + pub geo_type: GeoType, + /// Coordinate pairs as (latitude, longitude) + /// + /// - Point: 1 coordinate pair + /// - Line: 2+ coordinate pairs + /// - Polygon: 3+ coordinate pairs (first == last for closed polygon) + /// - Box: 2 coordinate pairs (lower-left, upper-right) + pub coordinates: Vec<(f64, f64)>, + /// Coordinate reference system (e.g., "EPSG:4326" for WGS84) + /// + /// Default is WGS84 (latitude/longitude) if not specified + pub srs_name: Option, +} + +impl GeoLocation { + /// Creates new point location + /// + /// # Arguments + /// + /// * `lat` - Latitude in decimal degrees + /// * `lon` - Longitude in decimal degrees + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::namespace::georss::GeoLocation; + /// + /// let loc = GeoLocation::point(45.256, -71.92); + /// assert_eq!(loc.coordinates.len(), 1); + /// ``` + #[must_use] + pub fn point(lat: f64, lon: f64) -> Self { + Self { + geo_type: GeoType::Point, + coordinates: vec![(lat, lon)], + srs_name: None, + } + } + + /// Creates new line location + /// + /// # Arguments + /// + /// * `coords` - Vector of (latitude, longitude) pairs + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::namespace::georss::GeoLocation; + /// + /// let coords = vec![(45.256, -71.92), (46.0, -72.0)]; + /// let loc = GeoLocation::line(coords); + /// 
assert_eq!(loc.coordinates.len(), 2); + /// ``` + #[must_use] + pub const fn line(coords: Vec<(f64, f64)>) -> Self { + Self { + geo_type: GeoType::Line, + coordinates: coords, + srs_name: None, + } + } + + /// Creates new polygon location + /// + /// # Arguments + /// + /// * `coords` - Vector of (latitude, longitude) pairs + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::namespace::georss::GeoLocation; + /// + /// let coords = vec![ + /// (45.0, -71.0), + /// (46.0, -71.0), + /// (46.0, -72.0), + /// (45.0, -71.0), // Close the polygon + /// ]; + /// let loc = GeoLocation::polygon(coords); + /// ``` + #[must_use] + pub const fn polygon(coords: Vec<(f64, f64)>) -> Self { + Self { + geo_type: GeoType::Polygon, + coordinates: coords, + srs_name: None, + } + } + + /// Creates new bounding box location + /// + /// # Arguments + /// + /// * `lower_lat` - Lower-left latitude + /// * `lower_lon` - Lower-left longitude + /// * `upper_lat` - Upper-right latitude + /// * `upper_lon` - Upper-right longitude + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::namespace::georss::GeoLocation; + /// + /// let loc = GeoLocation::bbox(45.0, -72.0, 46.0, -71.0); + /// assert_eq!(loc.coordinates.len(), 2); + /// ``` + #[must_use] + pub fn bbox(lower_lat: f64, lower_lon: f64, upper_lat: f64, upper_lon: f64) -> Self { + Self { + geo_type: GeoType::Box, + coordinates: vec![(lower_lat, lower_lon), (upper_lat, upper_lon)], + srs_name: None, + } + } +} + +/// Parse `GeoRSS` element and update entry +/// +/// # Arguments +/// +/// * `tag` - Element local name (e.g., "point", "line", "polygon", "box") +/// * `text` - Element text content +/// * `entry` - Entry to update +/// * `_limits` - Parser limits (unused but kept for API consistency) +/// +/// # Returns +/// +/// `true` if element was recognized and handled, `false` otherwise +pub fn handle_entry_element( + tag: &[u8], + text: &str, + entry: &mut Entry, + _limits: &ParserLimits, +) -> bool { + match tag { + 
b"point" => { + if let Some(loc) = parse_point(text) { + entry.geo = Some(loc); + } + true + } + b"line" => { + if let Some(loc) = parse_line(text) { + entry.geo = Some(loc); + } + true + } + b"polygon" => { + if let Some(loc) = parse_polygon(text) { + entry.geo = Some(loc); + } + true + } + b"box" => { + if let Some(loc) = parse_box(text) { + entry.geo = Some(loc); + } + true + } + _ => false, + } +} + +/// Parse georss:point element +/// +/// Format: "lat lon" (space-separated) +/// Example: "45.256 -71.92" +fn parse_point(text: &str) -> Option { + let coords = parse_coordinates(text)?; + if coords.len() == 1 { + Some(GeoLocation { + geo_type: GeoType::Point, + coordinates: coords, + srs_name: None, + }) + } else { + None + } +} + +/// Parse georss:line element +/// +/// Format: "lat1 lon1 lat2 lon2 ..." (space-separated) +/// Example: "45.256 -71.92 46.0 -72.0" +fn parse_line(text: &str) -> Option { + let coords = parse_coordinates(text)?; + if coords.len() >= 2 { + Some(GeoLocation { + geo_type: GeoType::Line, + coordinates: coords, + srs_name: None, + }) + } else { + None + } +} + +/// Parse georss:polygon element +/// +/// Format: "lat1 lon1 lat2 lon2 lat3 lon3 ..." (space-separated) +/// Example: "45.0 -71.0 46.0 -71.0 46.0 -72.0 45.0 -71.0" +fn parse_polygon(text: &str) -> Option { + let coords = parse_coordinates(text)?; + if coords.len() >= 3 { + Some(GeoLocation { + geo_type: GeoType::Polygon, + coordinates: coords, + srs_name: None, + }) + } else { + None + } +} + +/// Parse georss:box element +/// +/// Format: space-separated values (lower-left, upper-right) +/// Example: "45.0 -72.0 46.0 -71.0" +fn parse_box(text: &str) -> Option { + let coords = parse_coordinates(text)?; + if coords.len() == 2 { + Some(GeoLocation { + geo_type: GeoType::Box, + coordinates: coords, + srs_name: None, + }) + } else { + None + } +} + +/// Parse space-separated coordinate pairs +/// +/// Format: "lat1 lon1 lat2 lon2 ..." 
/// Parse space-separated coordinate pairs
///
/// Format: "lat1 lon1 lat2 lon2 ..." (pairs of floats)
///
/// Returns `None` when the text is empty or whitespace-only, holds an odd
/// number of values, contains a token that does not parse as `f64`, or
/// contains a latitude outside -90..=90 or a longitude outside -180..=180.
/// `NaN` parses as a float but fails the range check, so it is rejected too.
fn parse_coordinates(text: &str) -> Option<Vec<(f64, f64)>> {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    let pairs = tokens.chunks_exact(2);

    // A leftover token means the values do not pair up into lat/lon.
    if tokens.is_empty() || !pairs.remainder().is_empty() {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first invalid pair.
    pairs
        .map(|pair| {
            let lat = pair[0].parse::<f64>().ok()?;
            let lon = pair[1].parse::<f64>().ok()?;
            let in_range = (-90.0..=90.0).contains(&lat) && (-180.0..=180.0).contains(&lon);
            in_range.then_some((lat, lon))
        })
        .collect()
}
-71.0").unwrap(); + assert_eq!(loc.geo_type, GeoType::Box); + assert_eq!(loc.coordinates.len(), 2); + assert_eq!(loc.coordinates[0], (45.0, -72.0)); // Lower-left + assert_eq!(loc.coordinates[1], (46.0, -71.0)); // Upper-right + } + + #[test] + fn test_parse_box_invalid() { + // Box needs exactly 2 points (4 values) + assert!(parse_box("45.0 -72.0").is_none()); + assert!(parse_box("45.0 -72.0 46.0 -71.0 extra values").is_none()); + } + + #[test] + fn test_coordinate_validation() { + // Invalid latitude (> 90) + assert!(parse_point("91.0 0.0").is_none()); + // Invalid latitude (< -90) + assert!(parse_point("-91.0 0.0").is_none()); + // Invalid longitude (> 180) + assert!(parse_point("0.0 181.0").is_none()); + // Invalid longitude (< -180) + assert!(parse_point("0.0 -181.0").is_none()); + } + + #[test] + fn test_handle_entry_element_point() { + let mut entry = Entry::default(); + let limits = ParserLimits::default(); + + let handled = handle_entry_element(b"point", "45.256 -71.92", &mut entry, &limits); + assert!(handled); + assert!(entry.geo.is_some()); + + let geo = entry.geo.as_ref().unwrap(); + assert_eq!(geo.geo_type, GeoType::Point); + assert_eq!(geo.coordinates[0], (45.256, -71.92)); + } + + #[test] + fn test_handle_entry_element_line() { + let mut entry = Entry::default(); + let limits = ParserLimits::default(); + + let handled = handle_entry_element( + b"line", + "45.256 -71.92 46.0 -72.0", + &mut entry, + &limits, + ); + assert!(handled); + assert!(entry.geo.is_some()); + assert_eq!(entry.geo.as_ref().unwrap().geo_type, GeoType::Line); + } + + #[test] + fn test_handle_entry_element_unknown() { + let mut entry = Entry::default(); + let limits = ParserLimits::default(); + + let handled = handle_entry_element(b"unknown", "data", &mut entry, &limits); + assert!(!handled); + assert!(entry.geo.is_none()); + } + + #[test] + fn test_geo_location_constructors() { + let point = GeoLocation::point(45.0, -71.0); + assert_eq!(point.geo_type, GeoType::Point); + 
assert_eq!(point.coordinates.len(), 1); + + let line = GeoLocation::line(vec![(45.0, -71.0), (46.0, -72.0)]); + assert_eq!(line.geo_type, GeoType::Line); + assert_eq!(line.coordinates.len(), 2); + + let polygon = GeoLocation::polygon(vec![(45.0, -71.0), (46.0, -71.0), (45.0, -71.0)]); + assert_eq!(polygon.geo_type, GeoType::Polygon); + assert_eq!(polygon.coordinates.len(), 3); + + let bbox = GeoLocation::bbox(45.0, -72.0, 46.0, -71.0); + assert_eq!(bbox.geo_type, GeoType::Box); + assert_eq!(bbox.coordinates.len(), 2); + } + + #[test] + fn test_whitespace_handling() { + let loc = parse_point(" 45.256 -71.92 ").unwrap(); + assert_eq!(loc.coordinates[0], (45.256, -71.92)); + } +} diff --git a/crates/feedparser-rs-core/src/namespace/mod.rs b/crates/feedparser-rs-core/src/namespace/mod.rs index fbf012e..7056e7d 100644 --- a/crates/feedparser-rs-core/src/namespace/mod.rs +++ b/crates/feedparser-rs-core/src/namespace/mod.rs @@ -6,6 +6,8 @@ /// - **Dublin Core** (`dc:`) - Metadata elements /// - **Content** (`content:`) - Full HTML content /// - **Media RSS** (`media:`) - Multimedia content +/// - **GeoRSS** (`georss:`) - Geographic location data +/// - **Creative Commons** (`cc:`) - License information /// /// # Usage /// @@ -23,10 +25,14 @@ /// dublin_core::handle_feed_element("creator", "John Doe", &mut feed); /// assert_eq!(feed.author.as_deref(), Some("John Doe")); /// ``` +/// Creative Commons license information +pub mod cc; /// Content Module for RSS 1.0 pub mod content; /// Dublin Core Metadata Element Set pub mod dublin_core; +/// GeoRSS geographic location data +pub mod georss; /// Media RSS specification pub mod media_rss; @@ -55,6 +61,15 @@ pub mod namespaces { /// Podcast 2.0 pub const PODCAST: &str = "https://podcastindex.org/namespace/1.0"; + + /// `GeoRSS` + pub const GEORSS: &str = "http://www.georss.org/georss"; + + /// Creative Commons (modern) + pub const CC: &str = "http://creativecommons.org/ns#"; + + /// Creative Commons (legacy Userland) + pub 
const CREATIVE_COMMONS: &str = "http://backend.userland.com/creativeCommonsRssModule"; } /// Get namespace URI for a common prefix @@ -75,6 +90,9 @@ pub fn get_namespace_uri(prefix: &str) -> Option<&'static str> { "rdf" => Some(namespaces::RDF), "itunes" => Some(namespaces::ITUNES), "podcast" => Some(namespaces::PODCAST), + "georss" => Some(namespaces::GEORSS), + "cc" => Some(namespaces::CC), + "creativeCommons" => Some(namespaces::CREATIVE_COMMONS), _ => None, } } @@ -97,6 +115,9 @@ pub fn get_namespace_prefix(uri: &str) -> Option<&'static str> { namespaces::RDF => Some("rdf"), namespaces::ITUNES => Some("itunes"), namespaces::PODCAST => Some("podcast"), + namespaces::GEORSS => Some("georss"), + namespaces::CC => Some("cc"), + namespaces::CREATIVE_COMMONS => Some("creativeCommons"), _ => None, } } diff --git a/crates/feedparser-rs-core/src/options.rs b/crates/feedparser-rs-core/src/options.rs new file mode 100644 index 0000000..33d5757 --- /dev/null +++ b/crates/feedparser-rs-core/src/options.rs @@ -0,0 +1,223 @@ +//! Parser configuration options +//! +//! This module provides configuration options for customizing feed parsing behavior. +//! Options control features like URL resolution, HTML sanitization, and resource limits. + +use crate::limits::ParserLimits; + +/// Parser configuration options +/// +/// Controls various aspects of feed parsing behavior including URL resolution, +/// HTML sanitization, and resource limits for `DoS` protection. 
+/// +/// # Examples +/// +/// ``` +/// use feedparser_rs::ParseOptions; +/// +/// // Default options (recommended for most use cases) +/// let options = ParseOptions::default(); +/// assert!(options.resolve_relative_uris); +/// assert!(options.sanitize_html); +/// +/// // Custom options for restricted environment +/// let custom = ParseOptions { +/// resolve_relative_uris: true, +/// sanitize_html: false, // Trust feed content +/// limits: feedparser_rs::ParserLimits::strict(), +/// }; +/// ``` +#[derive(Debug, Clone)] +pub struct ParseOptions { + /// Whether to resolve relative URLs to absolute URLs + /// + /// When `true`, relative URLs in links, images, and other resources + /// are converted to absolute URLs using the feed's base URL. + /// + /// Default: `true` + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::ParseOptions; + /// + /// let mut options = ParseOptions::default(); + /// options.resolve_relative_uris = false; // Keep relative URLs + /// ``` + pub resolve_relative_uris: bool, + + /// Whether to sanitize HTML content in feed entries + /// + /// When `true`, HTML content in titles, summaries, and content blocks + /// is sanitized to remove potentially dangerous elements and attributes + /// (scripts, iframes, etc.) while preserving safe formatting. + /// + /// Default: `true` + /// + /// # Security + /// + /// Disabling HTML sanitization is **not recommended** unless you fully + /// trust the feed source and have other security measures in place. + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::ParseOptions; + /// + /// let mut options = ParseOptions::default(); + /// options.sanitize_html = false; // Disable for trusted feeds + /// ``` + pub sanitize_html: bool, + + /// Parser limits for `DoS` protection + /// + /// Controls maximum allowed sizes for collections, text fields, + /// and overall feed size to prevent resource exhaustion attacks. 
+ /// + /// Default: `ParserLimits::default()` + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::{ParseOptions, ParserLimits}; + /// + /// let options = ParseOptions { + /// limits: ParserLimits::strict(), // Use stricter limits + /// ..Default::default() + /// }; + /// ``` + pub limits: ParserLimits, +} + +impl Default for ParseOptions { + /// Creates default parse options + /// + /// Default configuration: + /// - `resolve_relative_uris`: `true` + /// - `sanitize_html`: `true` + /// - `limits`: `ParserLimits::default()` + /// + /// These defaults are suitable for most use cases and provide + /// good security and compatibility. + fn default() -> Self { + Self { + resolve_relative_uris: true, + sanitize_html: true, + limits: ParserLimits::default(), + } + } +} + +impl ParseOptions { + /// Creates permissive parse options + /// + /// Suitable for trusted feeds where you want maximum compatibility + /// and performance: + /// - `resolve_relative_uris`: `true` + /// - `sanitize_html`: `false` + /// - `limits`: `ParserLimits::permissive()` + /// + /// # Security Warning + /// + /// Use only with trusted feed sources! 
+ /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::ParseOptions; + /// + /// let options = ParseOptions::permissive(); + /// assert!(!options.sanitize_html); + /// ``` + #[must_use] + pub const fn permissive() -> Self { + Self { + resolve_relative_uris: true, + sanitize_html: false, + limits: ParserLimits::permissive(), + } + } + + /// Creates strict parse options + /// + /// Suitable for untrusted feeds in resource-constrained environments: + /// - `resolve_relative_uris`: `false` (preserve original URLs) + /// - `sanitize_html`: `true` (remove dangerous content) + /// - `limits`: `ParserLimits::strict()` (tight resource limits) + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs::ParseOptions; + /// + /// let options = ParseOptions::strict(); + /// assert!(options.sanitize_html); + /// assert!(!options.resolve_relative_uris); + /// ``` + #[must_use] + pub const fn strict() -> Self { + Self { + resolve_relative_uris: false, + sanitize_html: true, + limits: ParserLimits::strict(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_options() { + let options = ParseOptions::default(); + assert!(options.resolve_relative_uris); + assert!(options.sanitize_html); + assert_eq!(options.limits.max_entries, 10_000); + } + + #[test] + fn test_permissive_options() { + let options = ParseOptions::permissive(); + assert!(options.resolve_relative_uris); + assert!(!options.sanitize_html); + assert_eq!(options.limits.max_entries, 100_000); + } + + #[test] + fn test_strict_options() { + let options = ParseOptions::strict(); + assert!(!options.resolve_relative_uris); + assert!(options.sanitize_html); + assert_eq!(options.limits.max_entries, 1_000); + } + + #[test] + fn test_custom_options() { + let options = ParseOptions { + resolve_relative_uris: false, + sanitize_html: false, + limits: ParserLimits::permissive(), + }; + assert!(!options.resolve_relative_uris); + assert!(!options.sanitize_html); + 
assert_eq!(options.limits.max_entries, 100_000); + } + + #[test] + fn test_options_clone() { + let options1 = ParseOptions::default(); + let options2 = options1.clone(); + assert_eq!(options1.resolve_relative_uris, options2.resolve_relative_uris); + assert_eq!(options1.sanitize_html, options2.sanitize_html); + } + + #[test] + fn test_options_debug() { + let options = ParseOptions::default(); + let debug_str = format!("{options:?}"); + assert!(debug_str.contains("ParseOptions")); + assert!(debug_str.contains("resolve_relative_uris")); + assert!(debug_str.contains("sanitize_html")); + } +} diff --git a/crates/feedparser-rs-core/src/types/entry.rs b/crates/feedparser-rs-core/src/types/entry.rs index a8935dc..6e8b2e6 100644 --- a/crates/feedparser-rs-core/src/types/entry.rs +++ b/crates/feedparser-rs-core/src/types/entry.rs @@ -72,6 +72,8 @@ pub struct Entry { pub podcast_transcripts: Vec, /// Podcast 2.0 persons for this episode (hosts, guests, etc.) pub podcast_persons: Vec, + /// `GeoRSS` location data + pub geo: Option, } impl Entry { diff --git a/crates/feedparser-rs-core/src/util/date.rs b/crates/feedparser-rs-core/src/util/date.rs index cb7dcc4..7a60667 100644 --- a/crates/feedparser-rs-core/src/util/date.rs +++ b/crates/feedparser-rs-core/src/util/date.rs @@ -13,7 +13,7 @@ const DATE_FORMATS: &[&str] = &[ "%Y-%m-%dT%H:%M:%SZ", // 2024-12-14T10:30:45Z "%Y-%m-%dT%H:%M:%S", // 2024-12-14T10:30:45 (no timezone) "%Y-%m-%d %H:%M:%S", // 2024-12-14 10:30:45 - "%Y-%m-%d", // 2024-12-14 + "%Y-%m-%d", // 2024-12-14 // W3C Date-Time variants "%Y-%m-%d %H:%M:%S%:z", // 2024-12-14 10:30:45+00:00 "%Y/%m/%d %H:%M:%S", // 2024/12/14 10:30:45 @@ -91,6 +91,30 @@ pub fn parse_date(input: &str) -> Option> { return Some(dt.with_timezone(&Utc)); } + // Special handling for year-only format (e.g., "2024") + if let Ok(year) = input.parse::() + && (1000..=9999).contains(&year) + { + return NaiveDate::from_ymd_opt(year, 1, 1) + .and_then(|d| d.and_hms_opt(0, 0, 0)) + .map(|dt| 
dt.and_utc()); + } + + // Special handling for year-month format (e.g., "2024-12") + if input.len() == 7 + && input.chars().nth(4) == Some('-') + && let (Ok(year), Ok(month)) = ( + input[..4].parse::(), + input[5..7].parse::(), + ) + && (1000..=9999).contains(&year) + && (1..=12).contains(&month) + { + return NaiveDate::from_ymd_opt(year, month, 1) + .and_then(|d| d.and_hms_opt(0, 0, 0)) + .map(|dt| dt.and_utc()); + } + // Try all format strings for fmt in DATE_FORMATS { // Try parsing with time component @@ -195,9 +219,11 @@ mod tests { } #[test] - fn test_partial_date() { - // Should fail gracefully - let dt = parse_date("2024-12"); + fn test_partial_date_invalid() { + // Invalid partial dates should fail + let dt = parse_date("2024-13"); // Invalid month + assert!(dt.is_none()); + let dt = parse_date("abcd-12"); assert!(dt.is_none()); } @@ -282,4 +308,38 @@ mod tests { let dt = parse_date("2023-02-29"); assert!(dt.is_none()); } + + #[test] + fn test_year_only_format() { + let dt = parse_date("2024").unwrap(); + assert_eq!(dt.year(), 2024); + assert_eq!(dt.month(), 1); + assert_eq!(dt.day(), 1); + assert_eq!(dt.hour(), 0); + } + + #[test] + fn test_year_month_format() { + let dt = parse_date("2024-12").unwrap(); + assert_eq!(dt.year(), 2024); + assert_eq!(dt.month(), 12); + assert_eq!(dt.day(), 1); + assert_eq!(dt.hour(), 0); + } + + #[test] + fn test_all_new_formats() { + let test_cases = vec![ + ("2024", 2024, 1, 1), + ("2024-12", 2024, 12, 1), + ]; + + for (date_str, year, month, day) in test_cases { + let dt = parse_date(date_str) + .unwrap_or_else(|| panic!("Failed to parse: {date_str}")); + assert_eq!(dt.year(), year, "Year mismatch for: {date_str}"); + assert_eq!(dt.month(), month, "Month mismatch for: {date_str}"); + assert_eq!(dt.day(), day, "Day mismatch for: {date_str}"); + } + } } From 6d15fe8fc185179509a2da592e2cbaa64f56c114 Mon Sep 17 00:00:00 2001 From: "Andrei G." 
Date: Tue, 16 Dec 2025 18:44:56 +0100 Subject: [PATCH 2/4] style: apply rustfmt formatting --- crates/feedparser-rs-core/src/namespace/cc.rs | 19 +++++++------------ .../src/namespace/georss.rs | 8 ++------ crates/feedparser-rs-core/src/options.rs | 5 ++++- crates/feedparser-rs-core/src/util/date.rs | 15 ++++----------- 4 files changed, 17 insertions(+), 30 deletions(-) diff --git a/crates/feedparser-rs-core/src/namespace/cc.rs b/crates/feedparser-rs-core/src/namespace/cc.rs index 33ef1ce..83c8166 100644 --- a/crates/feedparser-rs-core/src/namespace/cc.rs +++ b/crates/feedparser-rs-core/src/namespace/cc.rs @@ -14,10 +14,10 @@ //! Creative Commons: //! Legacy: +use crate::Entry; use crate::limits::ParserLimits; use crate::types::generics::LimitedCollectionExt; use crate::types::{FeedMeta, Link}; -use crate::Entry; /// Creative Commons namespace URI (modern) pub const CC: &str = "http://creativecommons.org/ns#"; @@ -166,17 +166,15 @@ mod tests { #[test] fn test_extract_license_url_from_text() { - let url = extract_license_url(&[], "http://creativecommons.org/licenses/by-nc/4.0/").unwrap(); + let url = + extract_license_url(&[], "http://creativecommons.org/licenses/by-nc/4.0/").unwrap(); assert_eq!(url, "http://creativecommons.org/licenses/by-nc/4.0/"); } #[test] fn test_extract_license_url_from_text_with_whitespace() { - let url = extract_license_url( - &[], - " http://creativecommons.org/licenses/by-nd/4.0/ ", - ) - .unwrap(); + let url = + extract_license_url(&[], " http://creativecommons.org/licenses/by-nd/4.0/ ").unwrap(); assert_eq!(url, "http://creativecommons.org/licenses/by-nd/4.0/"); } @@ -187,11 +185,8 @@ mod tests { b"rdf:resource".to_vec(), "http://creativecommons.org/licenses/by/4.0/".to_string(), )]; - let url = extract_license_url( - &attrs, - "http://creativecommons.org/licenses/by-sa/4.0/", - ) - .unwrap(); + let url = + extract_license_url(&attrs, "http://creativecommons.org/licenses/by-sa/4.0/").unwrap(); assert_eq!(url, 
"http://creativecommons.org/licenses/by/4.0/"); } diff --git a/crates/feedparser-rs-core/src/namespace/georss.rs b/crates/feedparser-rs-core/src/namespace/georss.rs index 21a7eca..f4a8857 100644 --- a/crates/feedparser-rs-core/src/namespace/georss.rs +++ b/crates/feedparser-rs-core/src/namespace/georss.rs @@ -391,12 +391,8 @@ mod tests { let mut entry = Entry::default(); let limits = ParserLimits::default(); - let handled = handle_entry_element( - b"line", - "45.256 -71.92 46.0 -72.0", - &mut entry, - &limits, - ); + let handled = + handle_entry_element(b"line", "45.256 -71.92 46.0 -72.0", &mut entry, &limits); assert!(handled); assert!(entry.geo.is_some()); assert_eq!(entry.geo.as_ref().unwrap().geo_type, GeoType::Line); diff --git a/crates/feedparser-rs-core/src/options.rs b/crates/feedparser-rs-core/src/options.rs index 33d5757..e93a6d1 100644 --- a/crates/feedparser-rs-core/src/options.rs +++ b/crates/feedparser-rs-core/src/options.rs @@ -208,7 +208,10 @@ mod tests { fn test_options_clone() { let options1 = ParseOptions::default(); let options2 = options1.clone(); - assert_eq!(options1.resolve_relative_uris, options2.resolve_relative_uris); + assert_eq!( + options1.resolve_relative_uris, + options2.resolve_relative_uris + ); assert_eq!(options1.sanitize_html, options2.sanitize_html); } diff --git a/crates/feedparser-rs-core/src/util/date.rs b/crates/feedparser-rs-core/src/util/date.rs index 7a60667..563b794 100644 --- a/crates/feedparser-rs-core/src/util/date.rs +++ b/crates/feedparser-rs-core/src/util/date.rs @@ -13,7 +13,7 @@ const DATE_FORMATS: &[&str] = &[ "%Y-%m-%dT%H:%M:%SZ", // 2024-12-14T10:30:45Z "%Y-%m-%dT%H:%M:%S", // 2024-12-14T10:30:45 (no timezone) "%Y-%m-%d %H:%M:%S", // 2024-12-14 10:30:45 - "%Y-%m-%d", // 2024-12-14 + "%Y-%m-%d", // 2024-12-14 // W3C Date-Time variants "%Y-%m-%d %H:%M:%S%:z", // 2024-12-14 10:30:45+00:00 "%Y/%m/%d %H:%M:%S", // 2024/12/14 10:30:45 @@ -103,10 +103,7 @@ pub fn parse_date(input: &str) -> Option> { // Special 
handling for year-month format (e.g., "2024-12") if input.len() == 7 && input.chars().nth(4) == Some('-') - && let (Ok(year), Ok(month)) = ( - input[..4].parse::<i32>(), - input[5..7].parse::<u32>(), - ) + && let (Ok(year), Ok(month)) = (input[..4].parse::<i32>(), input[5..7].parse::<u32>()) && (1000..=9999).contains(&year) && (1..=12).contains(&month) { @@ -329,14 +326,10 @@ mod tests { #[test] fn test_all_new_formats() { - let test_cases = vec![ - ("2024", 2024, 1, 1), - ("2024-12", 2024, 12, 1), - ]; + let test_cases = vec![("2024", 2024, 1, 1), ("2024-12", 2024, 12, 1)]; for (date_str, year, month, day) in test_cases { - let dt = parse_date(date_str) - .unwrap_or_else(|| panic!("Failed to parse: {date_str}")); + let dt = parse_date(date_str).unwrap_or_else(|| panic!("Failed to parse: {date_str}")); assert_eq!(dt.year(), year, "Year mismatch for: {date_str}"); assert_eq!(dt.month(), month, "Month mismatch for: {date_str}"); assert_eq!(dt.day(), day, "Day mismatch for: {date_str}"); From 6643fa46ff665e7e8c0aeac9607aab2ec44c1abf Mon Sep 17 00:00:00 2001 From: "Andrei G." 
Date: Tue, 16 Dec 2025 18:54:57 +0100 Subject: [PATCH 3/4] docs: remove backticks from GeoRSS in doc comments --- crates/feedparser-rs-core/src/namespace/georss.rs | 6 +++--- crates/feedparser-rs-core/src/namespace/mod.rs | 2 +- crates/feedparser-rs-core/src/types/entry.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/feedparser-rs-core/src/namespace/georss.rs b/crates/feedparser-rs-core/src/namespace/georss.rs index f4a8857..601f66b 100644 --- a/crates/feedparser-rs-core/src/namespace/georss.rs +++ b/crates/feedparser-rs-core/src/namespace/georss.rs @@ -18,7 +18,7 @@ use crate::limits::ParserLimits; use crate::types::Entry; -/// `GeoRSS` namespace URI +/// GeoRSS namespace URI pub const GEORSS: &str = "http://www.georss.org/georss"; /// Type of geographic shape @@ -35,7 +35,7 @@ pub enum GeoType { Box, } -/// Geographic location data from `GeoRSS` +/// Geographic location data from GeoRSS #[derive(Debug, Clone, Default, PartialEq)] pub struct GeoLocation { /// Type of geographic shape @@ -157,7 +157,7 @@ impl GeoLocation { } } -/// Parse `GeoRSS` element and update entry +/// Parse GeoRSS element and update entry /// /// # Arguments /// diff --git a/crates/feedparser-rs-core/src/namespace/mod.rs b/crates/feedparser-rs-core/src/namespace/mod.rs index 7056e7d..46a7270 100644 --- a/crates/feedparser-rs-core/src/namespace/mod.rs +++ b/crates/feedparser-rs-core/src/namespace/mod.rs @@ -62,7 +62,7 @@ pub mod namespaces { /// Podcast 2.0 pub const PODCAST: &str = "https://podcastindex.org/namespace/1.0"; - /// `GeoRSS` + /// GeoRSS pub const GEORSS: &str = "http://www.georss.org/georss"; /// Creative Commons (modern) diff --git a/crates/feedparser-rs-core/src/types/entry.rs b/crates/feedparser-rs-core/src/types/entry.rs index 6e8b2e6..676ec74 100644 --- a/crates/feedparser-rs-core/src/types/entry.rs +++ b/crates/feedparser-rs-core/src/types/entry.rs @@ -72,7 +72,7 @@ pub struct Entry { pub podcast_transcripts: Vec, /// Podcast 2.0 persons 
for this episode (hosts, guests, etc.) pub podcast_persons: Vec, - /// `GeoRSS` location data + /// GeoRSS location data pub geo: Option, } From 3730411ff8723a51ad093689d8cbc4e1088e1a94 Mon Sep 17 00:00:00 2001 From: "Andrei G." Date: Tue, 16 Dec 2025 18:57:04 +0100 Subject: [PATCH 4/4] Revert "docs: remove backticks from GeoRSS in doc comments" Clippy's doc_markdown lint requires backticks for mixed-case identifiers like GeoRSS. The Copilot suggestion was incorrect - reverting to pass CI. --- crates/feedparser-rs-core/src/namespace/georss.rs | 6 +++--- crates/feedparser-rs-core/src/namespace/mod.rs | 2 +- crates/feedparser-rs-core/src/types/entry.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/feedparser-rs-core/src/namespace/georss.rs b/crates/feedparser-rs-core/src/namespace/georss.rs index 601f66b..f4a8857 100644 --- a/crates/feedparser-rs-core/src/namespace/georss.rs +++ b/crates/feedparser-rs-core/src/namespace/georss.rs @@ -18,7 +18,7 @@ use crate::limits::ParserLimits; use crate::types::Entry; -/// GeoRSS namespace URI +/// `GeoRSS` namespace URI pub const GEORSS: &str = "http://www.georss.org/georss"; /// Type of geographic shape @@ -35,7 +35,7 @@ pub enum GeoType { Box, } -/// Geographic location data from GeoRSS +/// Geographic location data from `GeoRSS` #[derive(Debug, Clone, Default, PartialEq)] pub struct GeoLocation { /// Type of geographic shape @@ -157,7 +157,7 @@ impl GeoLocation { } } -/// Parse GeoRSS element and update entry +/// Parse `GeoRSS` element and update entry /// /// # Arguments /// diff --git a/crates/feedparser-rs-core/src/namespace/mod.rs b/crates/feedparser-rs-core/src/namespace/mod.rs index 46a7270..7056e7d 100644 --- a/crates/feedparser-rs-core/src/namespace/mod.rs +++ b/crates/feedparser-rs-core/src/namespace/mod.rs @@ -62,7 +62,7 @@ pub mod namespaces { /// Podcast 2.0 pub const PODCAST: &str = "https://podcastindex.org/namespace/1.0"; - /// GeoRSS + /// `GeoRSS` pub const GEORSS: &str = 
"http://www.georss.org/georss"; /// Creative Commons (modern) diff --git a/crates/feedparser-rs-core/src/types/entry.rs b/crates/feedparser-rs-core/src/types/entry.rs index 676ec74..6e8b2e6 100644 --- a/crates/feedparser-rs-core/src/types/entry.rs +++ b/crates/feedparser-rs-core/src/types/entry.rs @@ -72,7 +72,7 @@ pub struct Entry { pub podcast_transcripts: Vec, /// Podcast 2.0 persons for this episode (hosts, guests, etc.) pub podcast_persons: Vec, - /// GeoRSS location data + /// `GeoRSS` location data pub geo: Option, }