diff --git a/CHANGELOG.md b/CHANGELOG.md
index ebeee66..451fe77 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.0] - 2025-12-28
+
+### Added
+- **Python feedparser compatibility improvements**:
+  - Field alias mappings for deprecated field names (`description` → `subtitle`, `guid` → `id`, etc.)
+  - Dict-style access on feed objects (`d['feed']['title']`, `d['entries'][0]['link']`)
+  - Container aliases (`channel` → `feed`, `items` → `entries`)
+  - Auto-URL detection in `parse()` function (URLs are automatically fetched when the http feature is enabled)
+  - Optional HTTP parameters (`etag`, `modified`, `user_agent`) for `parse()` and `parse_with_limits()`
+
+### Changed
+- `parse_with_limits()` now takes `limits` as a keyword argument (`limits=...`) for consistency
+
 ## [0.3.0] - 2025-12-18
 
 ### Added
@@ -147,7 +160,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Comprehensive test coverage
 - Documentation with examples
 
-[Unreleased]: https://github.com/bug-ops/feedparser-rs/compare/v0.3.0...HEAD
+[Unreleased]: https://github.com/bug-ops/feedparser-rs/compare/v0.4.0...HEAD
+[0.4.0]: https://github.com/bug-ops/feedparser-rs/compare/v0.3.0...v0.4.0
 [0.3.0]: https://github.com/bug-ops/feedparser-rs/compare/v0.2.1...v0.3.0
 [0.2.1]: https://github.com/bug-ops/feedparser-rs/compare/v0.2.0...v0.2.1
 [0.2.0]: https://github.com/bug-ops/feedparser-rs/compare/v0.1.8...v0.2.0
diff --git a/Cargo.lock b/Cargo.lock
index cfa1984..4774667 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -536,7 +536,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
 
 [[package]]
 name = "feedparser-rs"
-version = "0.3.0"
+version = "0.4.0"
 dependencies = [
  "ammonia",
  "chrono",
@@ -559,7 +559,7 @@ dependencies = [
 
 [[package]]
 name = "feedparser-rs-node"
-version = "0.3.0"
+version = "0.4.0"
 dependencies = [
  "feedparser-rs",
  "napi",
@@ -569,10 +569,11 @@ dependencies = [
 
 [[package]]
 name = "feedparser-rs-py"
-version = "0.3.0"
+version = "0.4.0"
 dependencies = [
  "chrono",
  "feedparser-rs",
+ "once_cell",
  "pyo3",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 9006134..ad81061 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,7 +7,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.3.0"
+version = "0.4.0"
 edition = "2024"
 rust-version = "1.88.0"
 authors = ["bug-ops"]
@@ -29,6 +29,7 @@ memchr = "2.7"
 mockito = "1.6"
 napi = "3.7"
 napi-derive = "3.4"
+once_cell = "1.20"
 pyo3 = "0.27"
 quick-xml = "0.38"
 regex = "1.11"
diff --git a/README.md b/README.md
index fd95e6f..f93638a 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ High-performance RSS/Atom/JSON Feed parser written in Rust, with Python and Node
 - **Conditional GET** — ETag/Last-Modified support for bandwidth-efficient polling
 - **Podcast support** — iTunes and Podcast 2.0 namespace extensions
 - **Multi-language bindings** — Native Python (PyO3) and Node.js (napi-rs) bindings
-- **Familiar API** — Inspired by Python's feedparser, easy to migrate existing code
+- **feedparser drop-in** — Dict-style access, field aliases, same API patterns as Python feedparser
 
 ## Supported Formats
@@ -146,18 +146,28 @@ See [Node.js API documentation](crates/feedparser-rs-node/README.md) for complet
 ### Python
 
 ```python
-import feedparser_rs
+import feedparser_rs as feedparser  # Drop-in replacement
 
-# Parse from bytes or string
-d = feedparser_rs.parse(b'...')
+# Parse from bytes, string, or URL (auto-detected)
+d = 
feedparser.parse(b'...') +d = feedparser.parse('https://example.com/feed.xml') # URL auto-detected + +# Attribute-style access print(d.version) # 'rss20' print(d.feed.title) print(d.bozo) # True if parsing had issues -print(d.entries[0].published_parsed) # time.struct_time + +# Dict-style access (feedparser-compatible) +print(d['feed']['title']) +print(d['entries'][0]['link']) + +# Deprecated field aliases work +print(d.feed.description) # → d.feed.subtitle +print(d.channel.title) # → d.feed.title ``` > [!NOTE] -> Python bindings provide `time.struct_time` for date fields, matching feedparser's API for easy migration. +> Python bindings provide full feedparser compatibility: dict-style access, field aliases, and `time.struct_time` for date fields. ## Cargo Features diff --git a/crates/feedparser-rs-node/package.json b/crates/feedparser-rs-node/package.json index daf8458..65877ed 100644 --- a/crates/feedparser-rs-node/package.json +++ b/crates/feedparser-rs-node/package.json @@ -1,6 +1,6 @@ { "name": "feedparser-rs", - "version": "0.3.0", + "version": "0.4.0", "description": "High-performance RSS/Atom/JSON Feed parser for Node.js", "main": "index.js", "types": "index.d.ts", diff --git a/crates/feedparser-rs-py/Cargo.toml b/crates/feedparser-rs-py/Cargo.toml index 53ab369..11bfaad 100644 --- a/crates/feedparser-rs-py/Cargo.toml +++ b/crates/feedparser-rs-py/Cargo.toml @@ -18,6 +18,7 @@ crate-type = ["cdylib"] feedparser-rs = { path = "../feedparser-rs-core" } pyo3 = { workspace = true, features = ["extension-module", "chrono"] } chrono = { workspace = true, features = ["clock"] } +once_cell = { workspace = true } [features] default = ["http"] diff --git a/crates/feedparser-rs-py/README.md b/crates/feedparser-rs-py/README.md index 92c58fe..d0383f0 100644 --- a/crates/feedparser-rs-py/README.md +++ b/crates/feedparser-rs-py/README.md @@ -14,7 +14,7 @@ High-performance RSS/Atom/JSON Feed parser for Python with feedparser-compatible - **Tolerant parsing**: Bozo flag for graceful handling of malformed feeds - **Multi-format**: RSS 0.9x/1.0/2.0, Atom 0.3/1.0, JSON Feed 1.0/1.1 - **Podcast support**: iTunes and Podcast 2.0 namespace extensions -- **Familiar API**: Inspired by feedparser, easy migration path +- **feedparser-compatible**: Dict-style access, field aliases, same API patterns - **DoS protection**: Built-in resource limits ## Installation @@ -33,15 +33,20 @@ pip install feedparser-rs ```python import feedparser_rs -# Parse from string or bytes +# Parse from string, bytes, or URL (auto-detected) d = feedparser_rs.parse('...') d = feedparser_rs.parse(b'...') +d = feedparser_rs.parse('https://example.com/feed.xml') # URL auto-detected -# Access data +# Attribute-style access (feedparser-compatible) print(d.feed.title) print(d.version) # "rss20", "atom10", etc. 
print(d.bozo) # True if parsing errors occurred +# Dict-style access (feedparser-compatible) +print(d['feed']['title']) +print(d['entries'][0]['link']) + for entry in d.entries: print(entry.title) print(entry.published_parsed) # time.struct_time @@ -55,35 +60,63 @@ for entry in d.entries: ```python import feedparser_rs -# Fetch and parse in one call +# Option 1: Auto-detection (recommended) +d = feedparser_rs.parse('https://example.com/feed.xml') + +# Option 2: Explicit URL function d = feedparser_rs.parse_url('https://example.com/feed.xml') -print(d.feed.title) -print(f"Fetched {len(d.entries)} entries") +# With conditional GET for efficient polling +d = feedparser_rs.parse( + 'https://example.com/feed.xml', + etag=cached_etag, + modified=cached_modified +) +if d.status == 304: + print("Feed not modified") # With custom limits limits = feedparser_rs.ParserLimits(max_entries=100) -d = feedparser_rs.parse_url_with_limits('https://example.com/feed.xml', limits) +d = feedparser_rs.parse_with_limits('https://example.com/feed.xml', limits=limits) ``` > [!TIP] -> `parse_url` supports automatic compression (gzip, deflate, brotli) and follows redirects. +> URL fetching supports automatic compression (gzip, deflate, brotli) and follows redirects. ## Migration from feedparser +feedparser-rs is designed as a drop-in replacement for Python feedparser: + ```python -# Option 1: alias import +# Drop-in replacement import feedparser_rs as feedparser -d = feedparser.parse(feed_content) -# Option 2: direct import -import feedparser_rs -d = feedparser_rs.parse(feed_content) +# Same API patterns work +d = feedparser.parse('https://example.com/feed.xml') +print(d.feed.title) +print(d['feed']['title']) # Dict-style access works too +print(d.entries[0].link) -# Option 3: URL fetching (new!) 
-d = feedparser_rs.parse_url('https://example.com/feed.xml') +# Deprecated field names supported +print(d.feed.description) # → d.feed.subtitle +print(d.channel.title) # → d.feed.title +print(d.items[0].guid) # → d.entries[0].id ``` +### Supported Field Aliases + +| Old Name | Maps To | +|----------|---------| +| `feed.description` | `feed.subtitle` or `feed.summary` | +| `feed.tagline` | `feed.subtitle` | +| `feed.copyright` | `feed.rights` | +| `feed.modified` | `feed.updated` | +| `channel` | `feed` | +| `items` | `entries` | +| `entry.guid` | `entry.id` | +| `entry.description` | `entry.summary` | +| `entry.issued` | `entry.published` | + ## Advanced Usage ### Custom Resource Limits @@ -98,7 +131,7 @@ limits = feedparser_rs.ParserLimits( max_links_per_entry=50, ) -d = feedparser_rs.parse_with_limits(feed_data, limits) +d = feedparser_rs.parse_with_limits(feed_data, limits=limits) ``` ### Format Detection @@ -132,20 +165,23 @@ for entry in d.entries: ### Functions -- `parse(source)` — Parse feed from bytes or str -- `parse_url(url)` — Fetch and parse feed from URL -- `parse_with_limits(source, limits)` — Parse with custom resource limits -- `parse_url_with_limits(url, limits)` — Fetch and parse with custom limits +- `parse(source, etag=None, modified=None, user_agent=None)` — Parse feed from bytes, str, or URL (auto-detected) +- `parse_url(url, etag=None, modified=None, user_agent=None)` — Fetch and parse feed from URL +- `parse_with_limits(source, etag=None, modified=None, user_agent=None, limits=None)` — Parse with custom resource limits +- `parse_url_with_limits(url, etag=None, modified=None, user_agent=None, limits=None)` — Fetch and parse with custom limits - `detect_format(source)` — Detect feed format without full parsing ### Classes -- `FeedParserDict` — Parsed feed result - - `.feed` — Feed metadata - - `.entries` — List of entries +- `FeedParserDict` — Parsed feed result (supports both attribute and dict-style access) + - `.feed` / `['feed']` — Feed metadata + - `.entries` / `['entries']` — List of entries - `.bozo` — True if parsing errors occurred - `.version` — Feed version string - `.encoding` — Character encoding + - `.status` — HTTP status code (for URL fetches) + - `.etag` — ETag header (for conditional GET) + - `.modified` — Last-Modified header (for conditional GET) - `ParserLimits` — Resource limits configuration diff --git a/crates/feedparser-rs-py/pyproject.toml b/crates/feedparser-rs-py/pyproject.toml index c3c88ea..855f575 100644 --- a/crates/feedparser-rs-py/pyproject.toml +++ b/crates/feedparser-rs-py/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "feedparser-rs" -version = "0.3.0" +version = "0.4.0" description = "High-performance RSS/Atom/JSON Feed parser with feedparser-compatible API" readme = "README.md" license = { text = "MIT OR Apache-2.0" } diff --git a/crates/feedparser-rs-py/src/lib.rs b/crates/feedparser-rs-py/src/lib.rs index 84847a7..ef82276 100644 --- a/crates/feedparser-rs-py/src/lib.rs +++ b/crates/feedparser-rs-py/src/lib.rs @@ -40,39 +40,136 @@ fn _feedparser_rs(m: &Bound<'_, PyModule>) -> PyResult<()> { Ok(()) } -/// Parse an RSS/Atom/JSON Feed from bytes or string +/// Parse an RSS/Atom/JSON Feed from bytes, string, or URL +/// +/// Automatically detects whether `source` is a URL (http://, https://) or content. +/// For URLs, fetches and parses the feed. For content, parses directly. 
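+///
+/// URL detection is a simple prefix check: only strings beginning with
+/// `http://` or `https://` are treated as URLs, so feed content is never
+/// fetched by mistake.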
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs with conditional GET)
+/// * `modified` - Optional Last-Modified timestamp (for URLs with conditional GET)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// # Parse from URL (auto-detected)
+/// feed = feedparser_rs.parse("https://example.com/feed.xml")
+///
+/// # Parse from content
+/// feed = feedparser_rs.parse("...")
+///
+/// # Parse from URL with caching
+/// feed = feedparser_rs.parse(
+///     "https://example.com/feed.xml",
+///     etag=cached_etag,
+///     modified=cached_modified
+/// )
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, /))]
-fn parse(py: Python<'_>, source: &Bound<'_, PyAny>) -> PyResult<PyParsedFeed> {
-    parse_with_limits(py, source, None)
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None))]
+fn parse(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+) -> PyResult<PyParsedFeed> {
+    parse_internal(py, source, etag, modified, user_agent, None)
 }
 
 /// Parse with custom resource limits for DoS protection
+///
+/// Like `parse()` but allows specifying custom limits for untrusted feeds.
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs)
+/// * `modified` - Optional Last-Modified timestamp (for URLs)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+/// * `limits` - Optional parser limits for DoS protection
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// limits = feedparser_rs.ParserLimits.strict()
+///
+/// # Parse from URL with limits
+/// feed = feedparser_rs.parse_with_limits(
+///     "https://example.com/feed.xml",
+///     limits=limits
+/// )
+///
+/// # Parse from content with limits
+/// feed = feedparser_rs.parse_with_limits("...", limits=limits)
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, limits=None))]
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None, limits=None))]
 fn parse_with_limits(
     py: Python<'_>,
     source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
     limits: Option<&PyParserLimits>,
 ) -> PyResult<PyParsedFeed> {
-    let bytes: Vec<u8> = if let Ok(s) = source.extract::<String>() {
+    parse_internal(py, source, etag, modified, user_agent, limits)
+}
+
+/// Internal parse function that handles both URL and content sources
+fn parse_internal(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+    limits: Option<&PyParserLimits>,
+) -> PyResult<PyParsedFeed> {
+    // Try to extract as string first
+    if let Ok(s) = source.extract::<String>() {
+        // Check if it's a URL
         if s.starts_with("http://") || s.starts_with("https://") {
-            return Err(pyo3::exceptions::PyNotImplementedError::new_err(
-                "URL fetching not implemented. Use requests.get(url).content",
-            ));
+            // Handle URL - requires http feature
+            #[cfg(feature = "http")]
+            {
+                let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+                let parsed =
+                    core::parse_url_with_limits(&s, etag, modified, user_agent, parser_limits)
+                        .map_err(convert_feed_error)?;
+                return PyParsedFeed::from_core(py, parsed);
+            }
+            #[cfg(not(feature = "http"))]
+            {
+                return Err(pyo3::exceptions::PyNotImplementedError::new_err(
+                    "URL fetching requires the 'http' feature. Build with: maturin develop --features http",
+                ));
+            }
         }
-        s.into_bytes()
-    } else if let Ok(b) = source.extract::<Vec<u8>>() {
-        b
-    } else {
-        return Err(pyo3::exceptions::PyTypeError::new_err(
-            "source must be str or bytes",
-        ));
-    };
-    let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
-    let parsed = core::parse_with_limits(&bytes, parser_limits).map_err(convert_feed_error)?;
-    PyParsedFeed::from_core(py, parsed)
+
+        // Parse as content
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed =
+            core::parse_with_limits(s.as_bytes(), parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    // Try to extract as bytes
+    if let Ok(b) = source.extract::<Vec<u8>>() {
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed = core::parse_with_limits(&b, parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    Err(pyo3::exceptions::PyTypeError::new_err(
+        "source must be str, bytes, or URL",
+    ))
 }
 
 /// Detect feed format without full parsing
diff --git a/crates/feedparser-rs-py/src/types/compat.rs b/crates/feedparser-rs-py/src/types/compat.rs
new file mode 100644
index 0000000..930c9eb
--- /dev/null
+++ b/crates/feedparser-rs-py/src/types/compat.rs
@@ -0,0 +1,123 @@
+//! Python feedparser backward compatibility field mappings.
+//!
+//! This module provides field alias mappings for deprecated Python feedparser field names.
+//! Old field names map to new field names for backward compatibility.
+//!
+//! Example: `feed.description` → `feed.subtitle`
+//!          `entry.guid` → `entry.id`
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+
+/// Feed-level field mappings: old name → list of new names (tried in order).
+///
+/// Some aliases can map to multiple fields (e.g., description → subtitle OR summary).
+/// The resolver tries each new field in order until it finds a non-None value.
+pub static FEED_FIELD_MAP: Lazy<HashMap<&'static str, Vec<&'static str>>> = Lazy::new(|| {
+    let mut map = HashMap::new();
+
+    // Description aliases
+    map.insert("description", vec!["subtitle", "summary"]);
+    map.insert(
+        "description_detail",
+        vec!["subtitle_detail", "summary_detail"],
+    );
+
+    // Tagline aliases (old Atom 0.3 field)
+    map.insert("tagline", vec!["subtitle"]);
+    map.insert("tagline_detail", vec!["subtitle_detail"]);
+
+    // Info alias (RSS 1.0)
+    map.insert("info", vec!["subtitle"]);
+    map.insert("info_detail", vec!["subtitle_detail"]);
+
+    // Copyright alias
+    map.insert("copyright", vec!["rights"]);
+    map.insert("copyright_detail", vec!["rights_detail"]);
+
+    // Modified alias
+    map.insert("modified", vec!["updated"]);
+    map.insert("modified_parsed", vec!["updated_parsed"]);
+
+    // Date alias (generic fallback)
+    map.insert("date", vec!["updated", "published"]);
+    map.insert("date_parsed", vec!["updated_parsed", "published_parsed"]);
+
+    // URL alias
+    map.insert("url", vec!["link"]);
+
+    map
+});
+
+/// Entry-level field mappings: old name → list of new names (tried in order).
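+///
+/// For example, `date` resolves to `updated` first and falls back to
+/// `published`, mirroring the feed-level map above.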
+pub static ENTRY_FIELD_MAP: Lazy<HashMap<&'static str, Vec<&'static str>>> = Lazy::new(|| {
+    let mut map = HashMap::new();
+
+    // GUID alias (RSS)
+    map.insert("guid", vec!["id"]);
+
+    // Description alias
+    map.insert("description", vec!["summary"]);
+    map.insert("description_detail", vec!["summary_detail"]);
+
+    // Issued alias (old feedparser field)
+    map.insert("issued", vec!["published"]);
+    map.insert("issued_parsed", vec!["published_parsed"]);
+
+    // Modified alias
+    map.insert("modified", vec!["updated"]);
+    map.insert("modified_parsed", vec!["updated_parsed"]);
+
+    // Date alias (generic fallback)
+    map.insert("date", vec!["updated", "published"]);
+    map.insert("date_parsed", vec!["updated_parsed", "published_parsed"]);
+
+    map
+});
+
+/// Container-level field mappings for PyParsedFeed.
+pub static CONTAINER_FIELD_MAP: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
+    let mut map = HashMap::new();
+
+    // RSS uses <channel>, Atom uses <feed>
+    map.insert("channel", "feed");
+
+    // RSS uses <item>, Atom uses <entry>
+    map.insert("items", "entries");
+
+    map
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_feed_field_map_description() {
+        let targets = FEED_FIELD_MAP.get("description").unwrap();
+        assert_eq!(targets, &vec!["subtitle", "summary"]);
+    }
+
+    #[test]
+    fn test_feed_field_map_modified() {
+        let targets = FEED_FIELD_MAP.get("modified").unwrap();
+        assert_eq!(targets, &vec!["updated"]);
+    }
+
+    #[test]
+    fn test_entry_field_map_guid() {
+        let targets = ENTRY_FIELD_MAP.get("guid").unwrap();
+        assert_eq!(targets, &vec!["id"]);
+    }
+
+    #[test]
+    fn test_entry_field_map_issued() {
+        let targets = ENTRY_FIELD_MAP.get("issued").unwrap();
+        assert_eq!(targets, &vec!["published"]);
+    }
+
+    #[test]
+    fn test_container_field_map_channel() {
+        let target = CONTAINER_FIELD_MAP.get("channel").unwrap();
+        assert_eq!(*target, "feed");
+    }
+}
diff --git a/crates/feedparser-rs-py/src/types/entry.rs b/crates/feedparser-rs-py/src/types/entry.rs
index c518de4..6ad853c 100644
--- a/crates/feedparser-rs-py/src/types/entry.rs
+++ b/crates/feedparser-rs-py/src/types/entry.rs
@@ -1,7 +1,9 @@
 use feedparser_rs::Entry as CoreEntry;
+use pyo3::exceptions::{PyAttributeError, PyKeyError};
 use pyo3::prelude::*;
 
 use super::common::{PyContent, PyEnclosure, PyLink, PyPerson, PySource, PyTag, PyTextConstruct};
+use super::compat::ENTRY_FIELD_MAP;
 use super::datetime::optional_datetime_to_struct_time;
 use super::geo::PyGeoLocation;
 use super::media::{PyMediaContent, PyMediaThumbnail};
@@ -301,4 +303,416 @@ impl PyEntry {
             self.inner.id.as_deref().unwrap_or("no-id")
         )
     }
+
+    /// Provides backward compatibility for deprecated Python feedparser field names.
+    ///
+    /// Maps old field names to their modern equivalents:
+    /// - `guid` → `id`
+    /// - `description` → `summary`
+    /// - `issued` → `published`
+    /// - `modified` → `updated`
+    /// - `date` → `updated` (or `published` as fallback)
+    ///
+    /// This method is called by Python when normal attribute lookup fails.
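+    ///
+    /// From Python (illustrative; the names mirror `ENTRY_FIELD_MAP` in `compat.rs`):
+    ///
+    /// ```python
+    /// entry.guid         # resolves to entry.id
+    /// entry.description  # resolves to entry.summary
+    /// entry.date         # tries entry.updated, then entry.published
+    /// ```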
+    fn __getattr__(&self, py: Python<'_>, name: &str) -> PyResult<Py<PyAny>> {
+        // Check if this is a deprecated field name
+        if let Some(new_names) = ENTRY_FIELD_MAP.get(name) {
+            // Try each new field name in order
+            for new_name in new_names {
+                let value: Option<Py<PyAny>> = match *new_name {
+                    "id" => self
+                        .inner
+                        .id
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    "summary" => self
+                        .inner
+                        .summary
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    "summary_detail" => self.inner.summary_detail.as_ref().and_then(|tc| {
+                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                            .ok()
+                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                    }),
+                    "published" => self.inner.published.and_then(|dt| {
+                        dt.to_rfc3339()
+                            .into_pyobject(py)
+                            .map(|o| o.unbind().into())
+                            .ok()
+                    }),
+                    "published_parsed" => {
+                        optional_datetime_to_struct_time(py, &self.inner.published)
+                            .ok()
+                            .flatten()
+                    }
+                    "updated" => self.inner.updated.and_then(|dt| {
+                        dt.to_rfc3339()
+                            .into_pyobject(py)
+                            .map(|o| o.unbind().into())
+                            .ok()
+                    }),
+                    "updated_parsed" => optional_datetime_to_struct_time(py, &self.inner.updated)
+                        .ok()
+                        .flatten(),
+                    _ => None,
+                };
+
+                // If we found a value, return it
+                if let Some(v) = value {
+                    return Ok(v);
+                }
+            }
+        }
+
+        // Field not found - raise AttributeError
+        Err(PyAttributeError::new_err(format!(
+            "'Entry' object has no attribute '{}'",
+            name
+        )))
+    }
+
+    /// Provides dict-style access to fields for Python feedparser compatibility.
+    ///
+    /// Supports both modern field names and deprecated aliases.
+    /// This method is called by Python when using dict-style access: `entry['title']`.
+    ///
+    /// Raises KeyError for unknown keys (unlike __getattr__ which raises AttributeError).
+    fn __getitem__(&self, py: Python<'_>, key: &str) -> PyResult<Py<PyAny>> {
+        // Check for known fields first
+        match key {
+            "id" => Ok(self
+                .inner
+                .id
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "title" => Ok(self
+                .inner
+                .title
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "title_detail" => {
+                if let Some(ref tc) = self.inner.title_detail {
+                    Ok(Py::new(py, PyTextConstruct::from_core(tc.clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            "link" => Ok(self
+                .inner
+                .link
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "links" => {
+                let links: Vec<_> = self
+                    .inner
+                    .links
+                    .iter()
+                    .map(|l| PyLink::from_core(l.clone()))
+                    .collect();
+                Ok(links.into_pyobject(py)?.into_any().unbind())
+            }
+            "summary" => Ok(self
+                .inner
+                .summary
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "summary_detail" => {
+                if let Some(ref tc) = self.inner.summary_detail {
+                    Ok(Py::new(py, PyTextConstruct::from_core(tc.clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            "content" => {
+                let content: Vec<_> = self
+                    .inner
+                    .content
+                    .iter()
+                    .map(|c| PyContent::from_core(c.clone()))
+                    .collect();
+                Ok(content.into_pyobject(py)?.into_any().unbind())
+            }
+            "published" => Ok(self
+                .inner
+                .published
+                .map(|dt| dt.to_rfc3339())
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "published_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.published)?
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "updated" => Ok(self
+                .inner
+                .updated
+                .map(|dt| dt.to_rfc3339())
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "updated_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.updated)?
+                .into_pyobject(py)?
+ .into_any() + .unbind()), + "created" => Ok(self + .inner + .created + .map(|dt| dt.to_rfc3339()) + .into_pyobject(py)? + .into_any() + .unbind()), + "created_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.created)? + .into_pyobject(py)? + .into_any() + .unbind()), + "expired" => Ok(self + .inner + .expired + .map(|dt| dt.to_rfc3339()) + .into_pyobject(py)? + .into_any() + .unbind()), + "expired_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.expired)? + .into_pyobject(py)? + .into_any() + .unbind()), + "author" => Ok(self + .inner + .author + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "author_detail" => { + if let Some(ref p) = self.inner.author_detail { + Ok(Py::new(py, PyPerson::from_core(p.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "authors" => { + let authors: Vec<_> = self + .inner + .authors + .iter() + .map(|p| PyPerson::from_core(p.clone())) + .collect(); + Ok(authors.into_pyobject(py)?.into_any().unbind()) + } + "contributors" => { + let contributors: Vec<_> = self + .inner + .contributors + .iter() + .map(|p| PyPerson::from_core(p.clone())) + .collect(); + Ok(contributors.into_pyobject(py)?.into_any().unbind()) + } + "publisher" => Ok(self + .inner + .publisher + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "publisher_detail" => { + if let Some(ref p) = self.inner.publisher_detail { + Ok(Py::new(py, PyPerson::from_core(p.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "tags" => { + let tags: Vec<_> = self + .inner + .tags + .iter() + .map(|t| PyTag::from_core(t.clone())) + .collect(); + Ok(tags.into_pyobject(py)?.into_any().unbind()) + } + "enclosures" => { + let enclosures: Vec<_> = self + .inner + .enclosures + .iter() + .map(|e| PyEnclosure::from_core(e.clone())) + .collect(); + Ok(enclosures.into_pyobject(py)?.into_any().unbind()) + } + "comments" => Ok(self + .inner + .comments + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "source" => { + if let Some(ref s) = self.inner.source { + Ok(Py::new(py, PySource::from_core(s.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "itunes" => { + if let Some(ref i) = self.inner.itunes { + Ok(Py::new(py, PyItunesEntryMeta::from_core(i.as_ref().clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "podcast_transcripts" => { + let transcripts: Vec<_> = self + .inner + .podcast_transcripts + .iter() + .map(|t| PyPodcastTranscript::from_core(t.clone())) + .collect(); + Ok(transcripts.into_pyobject(py)?.into_any().unbind()) + } + "podcast_persons" => { + let persons: Vec<_> = self + .inner + .podcast_persons + .iter() + .map(|p| PyPodcastPerson::from_core(p.clone())) + .collect(); + Ok(persons.into_pyobject(py)?.into_any().unbind()) + } + "license" => Ok(self + .inner + .license + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "geo" => { + if let Some(ref g) = self.inner.geo { + Ok(Py::new(py, PyGeoLocation::from_core(g.as_ref().clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "dc_creator" => Ok(self + .inner + .dc_creator + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "dc_date" => Ok(self + .inner + .dc_date + .map(|dt| dt.to_rfc3339()) + .into_pyobject(py)? + .into_any() + .unbind()), + "dc_date_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.dc_date)? + .into_pyobject(py)? + .into_any() + .unbind()), + "dc_rights" => Ok(self + .inner + .dc_rights + .as_deref() + .into_pyobject(py)? 
+                .into_any()
+                .unbind()),
+            "dc_subject" => Ok(self
+                .inner
+                .dc_subject
+                .clone()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "media_thumbnails" => {
+                let thumbnails: Vec<_> = self
+                    .inner
+                    .media_thumbnails
+                    .iter()
+                    .map(|t| PyMediaThumbnail::from_core(t.clone()))
+                    .collect();
+                Ok(thumbnails.into_pyobject(py)?.into_any().unbind())
+            }
+            "media_content" => {
+                let content: Vec<_> = self
+                    .inner
+                    .media_content
+                    .iter()
+                    .map(|c| PyMediaContent::from_core(c.clone()))
+                    .collect();
+                Ok(content.into_pyobject(py)?.into_any().unbind())
+            }
+            "podcast" => {
+                if let Some(ref p) = self.inner.podcast {
+                    Ok(Py::new(py, PyPodcastEntryMeta::from_core(p.as_ref().clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            // Check for deprecated field name aliases
+            _ => {
+                if let Some(new_names) = ENTRY_FIELD_MAP.get(key) {
+                    // Try each new field name in order
+                    for new_name in new_names {
+                        let value: Option<Py<PyAny>> =
+                            match *new_name {
+                                "id" => self.inner.id.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                "summary" => self.inner.summary.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                "summary_detail" => {
+                                    self.inner.summary_detail.as_ref().and_then(|tc| {
+                                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                                            .ok()
+                                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                                    })
+                                }
+                                "published" => self.inner.published.and_then(|dt| {
+                                    dt.to_rfc3339()
+                                        .into_pyobject(py)
+                                        .map(|o| o.unbind().into())
+                                        .ok()
+                                }),
+                                "published_parsed" => {
+                                    optional_datetime_to_struct_time(py, &self.inner.published)
+                                        .ok()
+                                        .flatten()
+                                }
+                                "updated" => self.inner.updated.and_then(|dt| {
+                                    dt.to_rfc3339()
+                                        .into_pyobject(py)
+                                        .map(|o| o.unbind().into())
+                                        .ok()
+                                }),
+                                "updated_parsed" => {
+                                    optional_datetime_to_struct_time(py, &self.inner.updated)
+                                        .ok()
+                                        .flatten()
+                                }
+                                _ => None,
+                            };
+
+                        // If we found a value, return it
+                        if let Some(v) = value {
+                            return Ok(v);
+                        }
+                    }
+                }
+                // Field not found - raise KeyError
+                Err(PyKeyError::new_err(format!("'{}'", key)))
+            }
+        }
+    }
 }
diff --git a/crates/feedparser-rs-py/src/types/feed_meta.rs b/crates/feedparser-rs-py/src/types/feed_meta.rs
index fff1850..946b084 100644
--- a/crates/feedparser-rs-py/src/types/feed_meta.rs
+++ b/crates/feedparser-rs-py/src/types/feed_meta.rs
@@ -1,7 +1,9 @@
 use feedparser_rs::FeedMeta as CoreFeedMeta;
+use pyo3::exceptions::{PyAttributeError, PyKeyError};
 use pyo3::prelude::*;
 
 use super::common::{PyGenerator, PyImage, PyLink, PyPerson, PyTag, PyTextConstruct};
+use super::compat::FEED_FIELD_MAP;
 use super::datetime::optional_datetime_to_struct_time;
 use super::geo::PyGeoLocation;
 use super::podcast::{PyItunesFeedMeta, PyPodcastMeta};
@@ -252,4 +254,420 @@ impl PyFeedMeta {
             self.inner.link.as_deref().unwrap_or("no-link")
         )
     }
+
+    /// Provides backward compatibility for deprecated Python feedparser field names.
+    ///
+    /// Maps old field names to their modern equivalents:
+    /// - `description` → `subtitle` (or `summary` as fallback)
+    /// - `tagline` → `subtitle`
+    /// - `modified` → `updated`
+    /// - `copyright` → `rights`
+    /// - `date` → `updated` (or `published` as fallback)
+    /// - `url` → `link`
+    ///
+    /// This method is called by Python when normal attribute lookup fails.
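+    ///
+    /// From Python (illustrative; the names mirror `FEED_FIELD_MAP` in `compat.rs`):
+    ///
+    /// ```python
+    /// feed.copyright  # resolves to feed.rights
+    /// feed.tagline    # resolves to feed.subtitle
+    /// feed.date       # tries feed.updated, then feed.published
+    /// ```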
+    fn __getattr__(&self, py: Python<'_>, name: &str) -> PyResult<Py<PyAny>> {
+        // Check if this is a deprecated field name
+        if let Some(new_names) = FEED_FIELD_MAP.get(name) {
+            // Try each new field name in order
+            for new_name in new_names {
+                let value: Option<Py<PyAny>> = match *new_name {
+                    "subtitle" => self
+                        .inner
+                        .subtitle
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    "subtitle_detail" => self.inner.subtitle_detail.as_ref().and_then(|tc| {
+                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                            .ok()
+                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                    }),
+                    "summary" => self
+                        .inner
+                        .subtitle
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    "summary_detail" => self.inner.subtitle_detail.as_ref().and_then(|tc| {
+                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                            .ok()
+                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                    }),
+                    "rights" => self
+                        .inner
+                        .rights
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    "rights_detail" => self.inner.rights_detail.as_ref().and_then(|tc| {
+                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                            .ok()
+                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                    }),
+                    "updated" => self.inner.updated.and_then(|dt| {
+                        dt.to_rfc3339()
+                            .into_pyobject(py)
+                            .map(|o| o.unbind().into())
+                            .ok()
+                    }),
+                    "updated_parsed" => optional_datetime_to_struct_time(py, &self.inner.updated)
+                        .ok()
+                        .flatten(),
+                    "published" => self.inner.published.and_then(|dt| {
+                        dt.to_rfc3339()
+                            .into_pyobject(py)
+                            .map(|o| o.unbind().into())
+                            .ok()
+                    }),
+                    "published_parsed" => {
+                        optional_datetime_to_struct_time(py, &self.inner.published)
+                            .ok()
+                            .flatten()
+                    }
+                    "link" => self
+                        .inner
+                        .link
+                        .as_deref()
+                        .and_then(|v| v.into_pyobject(py).map(|o| o.unbind().into()).ok()),
+                    _ => None,
+                };
+
+                // If we found a value, return it
+                if let Some(v) = value {
+                    return Ok(v);
+                }
+            }
+        }
+
+        // Field not found - raise AttributeError
+        Err(PyAttributeError::new_err(format!(
+            "'FeedMeta' object has no attribute '{}'",
+            name
+        )))
+    }
+
+    /// Provides dict-style access to fields for Python feedparser compatibility.
+    ///
+    /// Supports both modern field names and deprecated aliases.
+    /// This method is called by Python when using dict-style access: `feed['title']`.
+    ///
+    /// Raises KeyError for unknown keys (unlike __getattr__ which raises AttributeError).
+    fn __getitem__(&self, py: Python<'_>, key: &str) -> PyResult<Py<PyAny>> {
+        // Check for known fields first
+        match key {
+            "title" => Ok(self
+                .inner
+                .title
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "title_detail" => {
+                if let Some(ref tc) = self.inner.title_detail {
+                    Ok(Py::new(py, PyTextConstruct::from_core(tc.clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            "link" => Ok(self
+                .inner
+                .link
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "links" => {
+                let links: Vec<_> = self
+                    .inner
+                    .links
+                    .iter()
+                    .map(|l| PyLink::from_core(l.clone()))
+                    .collect();
+                Ok(links.into_pyobject(py)?.into_any().unbind())
+            }
+            "subtitle" => Ok(self
+                .inner
+                .subtitle
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "subtitle_detail" => {
+                if let Some(ref tc) = self.inner.subtitle_detail {
+                    Ok(Py::new(py, PyTextConstruct::from_core(tc.clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            "updated" => Ok(self
+                .inner
+                .updated
+                .map(|dt| dt.to_rfc3339())
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "updated_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.updated)?
+                .into_pyobject(py)?
+ .into_any() + .unbind()), + "published" => Ok(self + .inner + .published + .map(|dt| dt.to_rfc3339()) + .into_pyobject(py)? + .into_any() + .unbind()), + "published_parsed" => Ok(optional_datetime_to_struct_time(py, &self.inner.published)? + .into_pyobject(py)? + .into_any() + .unbind()), + "author" => Ok(self + .inner + .author + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "author_detail" => { + if let Some(ref p) = self.inner.author_detail { + Ok(Py::new(py, PyPerson::from_core(p.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "authors" => { + let authors: Vec<_> = self + .inner + .authors + .iter() + .map(|p| PyPerson::from_core(p.clone())) + .collect(); + Ok(authors.into_pyobject(py)?.into_any().unbind()) + } + "contributors" => { + let contributors: Vec<_> = self + .inner + .contributors + .iter() + .map(|p| PyPerson::from_core(p.clone())) + .collect(); + Ok(contributors.into_pyobject(py)?.into_any().unbind()) + } + "publisher" => Ok(self + .inner + .publisher + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "publisher_detail" => { + if let Some(ref p) = self.inner.publisher_detail { + Ok(Py::new(py, PyPerson::from_core(p.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "language" => Ok(self + .inner + .language + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "rights" => Ok(self + .inner + .rights + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "rights_detail" => { + if let Some(ref tc) = self.inner.rights_detail { + Ok(Py::new(py, PyTextConstruct::from_core(tc.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "generator" => Ok(self + .inner + .generator + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "generator_detail" => { + if let Some(ref g) = self.inner.generator_detail { + Ok(Py::new(py, PyGenerator::from_core(g.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "image" => { + if let Some(ref i) = self.inner.image { + Ok(Py::new(py, PyImage::from_core(i.clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "icon" => Ok(self + .inner + .icon + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "logo" => Ok(self + .inner + .logo + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "tags" => { + let tags: Vec<_> = self + .inner + .tags + .iter() + .map(|t| PyTag::from_core(t.clone())) + .collect(); + Ok(tags.into_pyobject(py)?.into_any().unbind()) + } + "id" => Ok(self + .inner + .id + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "ttl" => Ok(self.inner.ttl.into_pyobject(py)?.into_any().unbind()), + "itunes" => { + if let Some(ref i) = self.inner.itunes { + Ok(Py::new(py, PyItunesFeedMeta::from_core(i.as_ref().clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "podcast" => { + if let Some(ref p) = self.inner.podcast { + Ok(Py::new(py, PyPodcastMeta::from_core(p.as_ref().clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "license" => Ok(self + .inner + .license + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "syndication" => { + if let Some(ref s) = self.inner.syndication { + Ok(Py::new(py, PySyndicationMeta::from_core(s.as_ref().clone()))?.into_any()) + } else { + Ok(py.None()) + } + } + "dc_creator" => Ok(self + .inner + .dc_creator + .as_deref() + .into_pyobject(py)? + .into_any() + .unbind()), + "dc_publisher" => Ok(self + .inner + .dc_publisher + .as_deref() + .into_pyobject(py)? 
+                .into_any()
+                .unbind()),
+            "dc_rights" => Ok(self
+                .inner
+                .dc_rights
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "geo" => {
+                if let Some(ref g) = self.inner.geo {
+                    Ok(Py::new(py, PyGeoLocation::from_core(g.as_ref().clone()))?.into_any())
+                } else {
+                    Ok(py.None())
+                }
+            }
+            // Check for deprecated field name aliases
+            _ => {
+                if let Some(new_names) = FEED_FIELD_MAP.get(key) {
+                    // Try each new field name in order
+                    for new_name in new_names {
+                        let value: Option<Py<PyAny>> =
+                            match *new_name {
+                                "subtitle" => self.inner.subtitle.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                "subtitle_detail" => {
+                                    self.inner.subtitle_detail.as_ref().and_then(|tc| {
+                                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                                            .ok()
+                                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                                    })
+                                }
+                                "summary" => self.inner.subtitle.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                "summary_detail" => {
+                                    self.inner.subtitle_detail.as_ref().and_then(|tc| {
+                                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                                            .ok()
+                                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                                    })
+                                }
+                                "rights" => self.inner.rights.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                "rights_detail" => {
+                                    self.inner.rights_detail.as_ref().and_then(|tc| {
+                                        Py::new(py, PyTextConstruct::from_core(tc.clone()))
+                                            .ok()
+                                            .map(|p: Py<PyTextConstruct>| p.into_any())
+                                    })
+                                }
+                                "updated" => self.inner.updated.and_then(|dt| {
+                                    dt.to_rfc3339()
+                                        .into_pyobject(py)
+                                        .map(|o| o.unbind().into())
+                                        .ok()
+                                }),
+                                "updated_parsed" => {
+                                    optional_datetime_to_struct_time(py, &self.inner.updated)
+                                        .ok()
+                                        .flatten()
+                                }
+                                "published" => self.inner.published.and_then(|dt| {
+                                    dt.to_rfc3339()
+                                        .into_pyobject(py)
+                                        .map(|o| o.unbind().into())
+                                        .ok()
+                                }),
+                                "published_parsed" => {
+                                    optional_datetime_to_struct_time(py, &self.inner.published)
+                                        .ok()
+                                        .flatten()
+                                }
+                                "link" => self.inner.link.as_deref().and_then(|v| {
+                                    v.into_pyobject(py).map(|o| o.unbind().into()).ok()
+                                }),
+                                _ => None,
+                            };
+
+                        // If we found a value, return it
+                        if let Some(v) = value {
+                            return Ok(v);
+                        }
+                    }
+                }
+                // Field not found - raise KeyError
+                Err(PyKeyError::new_err(format!("'{}'", key)))
+            }
+        }
+    }
 }
diff --git a/crates/feedparser-rs-py/src/types/mod.rs b/crates/feedparser-rs-py/src/types/mod.rs
index 1742b4a..4f6e0b3 100644
--- a/crates/feedparser-rs-py/src/types/mod.rs
+++ b/crates/feedparser-rs-py/src/types/mod.rs
@@ -1,4 +1,5 @@
 pub mod common;
+pub mod compat;
 pub mod datetime;
 pub mod entry;
 pub mod feed_meta;
diff --git a/crates/feedparser-rs-py/src/types/parsed_feed.rs b/crates/feedparser-rs-py/src/types/parsed_feed.rs
index ff2868e..090e041 100644
--- a/crates/feedparser-rs-py/src/types/parsed_feed.rs
+++ b/crates/feedparser-rs-py/src/types/parsed_feed.rs
@@ -1,7 +1,9 @@
 use feedparser_rs::ParsedFeed as CoreParsedFeed;
+use pyo3::exceptions::{PyAttributeError, PyKeyError};
 use pyo3::prelude::*;
 use pyo3::types::PyDict;
 
+use super::compat::CONTAINER_FIELD_MAP;
 use super::entry::PyEntry;
 use super::feed_meta::PyFeedMeta;
 
@@ -141,4 +143,114 @@ impl PyParsedFeed {
     fn __str__(&self) -> String {
         self.__repr__()
     }
+
+    /// Provides backward compatibility for deprecated Python feedparser container names.
+    ///
+    /// Maps old container names to their modern equivalents:
+    /// - `channel` → `feed` (RSS uses `<channel>`, Atom uses `<feed>`)
+    /// - `items` → `entries` (RSS uses `<item>`, Atom uses `<entry>`)
+    ///
+    /// This method is called by Python when normal attribute lookup fails.
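+    ///
+    /// From Python (illustrative):
+    ///
+    /// ```python
+    /// d.channel.title   # alias for d.feed.title
+    /// d.items[0].title  # alias for d.entries[0].title
+    /// ```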
+    fn __getattr__(&self, py: Python<'_>, name: &str) -> PyResult<Py<PyAny>> {
+        // Check if this is a deprecated container name
+        if let Some(new_name) = CONTAINER_FIELD_MAP.get(name) {
+            match *new_name {
+                "feed" => {
+                    // Convert Py<PyFeedMeta> to Py<PyAny>
+                    Ok(self.feed.clone_ref(py).into())
+                }
+                "entries" => {
+                    // Convert Vec<Py<PyEntry>> to Py<PyAny> (as Python list)
+                    let entries: Vec<_> = self.entries.iter().map(|e| e.clone_ref(py)).collect();
+                    match entries.into_pyobject(py) {
+                        Ok(list) => Ok(list.unbind()),
+                        Err(e) => Err(e),
+                    }
+                }
+                _ => Err(PyAttributeError::new_err(format!(
+                    "'FeedParserDict' object has no attribute '{}'",
+                    name
+                ))),
+            }
+        } else {
+            // Field not found - raise AttributeError
+            Err(PyAttributeError::new_err(format!(
+                "'FeedParserDict' object has no attribute '{}'",
+                name
+            )))
+        }
+    }
+
+    /// Provides dict-style access to fields for Python feedparser compatibility.
+    ///
+    /// Supports both modern field names and deprecated aliases:
+    /// - `d['feed']` → feed metadata
+    /// - `d['entries']` → list of entries
+    /// - `d['channel']` → feed (deprecated alias)
+    /// - `d['items']` → entries (deprecated alias)
+    /// - `d['version']`, `d['bozo']`, etc. → top-level fields
+    ///
+    /// This method is called by Python when using dict-style access: `d[key]`.
+    fn __getitem__(&self, py: Python<'_>, key: &str) -> PyResult<Py<PyAny>> {
+        // Check for known fields first
+        match key {
+            "feed" => Ok(self.feed.clone_ref(py).into()),
+            "entries" => {
+                let entries: Vec<_> = self.entries.iter().map(|e| e.clone_ref(py)).collect();
+                Ok(entries.into_pyobject(py)?.into_any().unbind())
+            }
+            "bozo" => {
+                let pybozo = self.bozo.into_pyobject(py)?.to_owned();
+                Ok(pybozo.into_any().unbind())
+            }
+            "bozo_exception" => Ok(self
+                .bozo_exception
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "encoding" => Ok(self
+                .encoding
+                .as_str()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            "version" => Ok(self.version.as_str().into_pyobject(py)?.into_any().unbind()),
+            "namespaces" => Ok(self.namespaces.clone_ref(py).into()),
+            "status" => Ok(self.status.into_pyobject(py)?.into_any().unbind()),
+            "href" => Ok(self.href.as_deref().into_pyobject(py)?.into_any().unbind()),
+            "etag" => Ok(self.etag.as_deref().into_pyobject(py)?.into_any().unbind()),
+            "modified" => Ok(self
+                .modified
+                .as_deref()
+                .into_pyobject(py)?
+                .into_any()
+                .unbind()),
+            #[cfg(feature = "http")]
+            "headers" => {
+                if let Some(ref headers) = self.headers {
+                    Ok(headers.clone_ref(py).into())
+                } else {
+                    Ok(py.None().into_pyobject(py)?.into_any().unbind())
+                }
+            }
+            // Check for deprecated container name aliases
+            _ => {
+                if let Some(new_name) = CONTAINER_FIELD_MAP.get(key) {
+                    match *new_name {
+                        "feed" => Ok(self.feed.clone_ref(py).into()),
+                        "entries" => {
+                            let entries: Vec<_> =
+                                self.entries.iter().map(|e| e.clone_ref(py)).collect();
+                            Ok(entries.into_pyobject(py)?.into_any().unbind())
+                        }
+                        _ => Err(PyKeyError::new_err(format!("'{}'", key))),
+                    }
+                } else {
+                    // Field not found - raise KeyError
+                    Err(PyKeyError::new_err(format!("'{}'", key)))
+                }
+            }
+        }
+    }
 }
diff --git a/crates/feedparser-rs-py/tests/test_basic.py b/crates/feedparser-rs-py/tests/test_basic.py
index 908a31a..bb7ba11 100644
--- a/crates/feedparser-rs-py/tests/test_basic.py
+++ b/crates/feedparser-rs-py/tests/test_basic.py
@@ -137,7 +137,7 @@ def test_parse_with_limits():
         max_entries=10,
     )
 
-    d = feedparser_rs.parse_with_limits(xml, limits)
+    d = feedparser_rs.parse_with_limits(xml, limits=limits)
 
     assert d.version == "rss20"
 
@@ -150,7 +150,7 @@ def test_parse_with_limits_exceeded():
     )
 
     with pytest.raises(ValueError, match="exceeds maximum"):
-        feedparser_rs.parse_with_limits(xml, limits)
+        feedparser_rs.parse_with_limits(xml, limits=limits)
 
 
 def test_detect_format_rss20():
diff --git a/crates/feedparser-rs-py/tests/test_compat.py b/crates/feedparser-rs-py/tests/test_compat.py
new file mode 100644
index 0000000..7f4bf99
--- /dev/null
+++ b/crates/feedparser-rs-py/tests/test_compat.py
@@ -0,0 +1,739 @@
+"""
+Test Python feedparser backward compatibility field mappings.
+
+Tests that deprecated field names correctly map to their modern equivalents:
+- Feed-level: description, tagline, modified, copyright, date, url
+- Entry-level: guid, description, issued, modified, date
+- Container-level: channel, items
+"""
+
+import pytest
+import feedparser_rs
+
+
+def test_feed_description_alias():
+    """feed.description should map to feed.subtitle"""
+    xml = """<rss version="2.0">
+    <channel>
+        <description>Test subtitle text</description>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Both should work and return the same value
+    assert feed.feed.subtitle == "Test subtitle text"
+    assert feed.feed.description == "Test subtitle text"
+    assert feed.feed.description == feed.feed.subtitle
+
+
+def test_feed_tagline_alias():
+    """feed.tagline should map to feed.subtitle (old Atom 0.3 field)"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <subtitle>My feed tagline</subtitle>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    assert feed.feed.subtitle == "My feed tagline"
+    assert feed.feed.tagline == "My feed tagline"
+    assert feed.feed.tagline == feed.feed.subtitle
+
+
+def test_feed_modified_alias():
+    """feed.modified should map to feed.updated"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <updated>2024-01-01T12:00:00Z</updated>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    assert feed.feed.updated is not None
+    assert feed.feed.modified == feed.feed.updated
+    # Both _parsed versions should work
+    assert feed.feed.modified_parsed is not None
+    assert feed.feed.modified_parsed.tm_year == 2024
+
+
+def test_feed_copyright_alias():
+    """feed.copyright should map to feed.rights"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <rights>Copyright 2024 Example Corp</rights>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    assert feed.feed.rights == "Copyright 2024 Example Corp"
+    assert feed.feed.copyright == "Copyright 2024 Example Corp"
+    assert feed.feed.copyright == feed.feed.rights
+
+
+def test_feed_date_alias_falls_back_to_updated():
+    """feed.date should map to feed.updated as primary fallback"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <updated>2024-01-15T10:30:00Z</updated>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    assert feed.feed.date == feed.feed.updated
+    assert feed.feed.date_parsed.tm_year == 2024
+    assert feed.feed.date_parsed.tm_mon == 1
+    assert feed.feed.date_parsed.tm_mday == 15
+
+
+def test_feed_date_alias_falls_back_to_published():
+    """feed.date should fall back to feed.published if updated is absent"""
+    xml = """<rss version="2.0">
+    <channel>
+        <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # updated is None, so date should map to published
+    assert feed.feed.updated is None
+    assert feed.feed.published is not None
+    assert feed.feed.date == feed.feed.published
+    assert feed.feed.date_parsed.tm_year == 2024
+
+
+def test_feed_url_alias():
+    """feed.url should map to feed.link"""
+    xml = """<rss version="2.0">
+    <channel>
+        <link>https://example.com</link>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    assert feed.feed.link == "https://example.com"
+    assert feed.feed.url == "https://example.com"
+    assert feed.feed.url == feed.feed.link
+
+
+def test_entry_guid_alias():
+    """entry.guid should map to entry.id"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item>
+            <guid>abc123xyz</guid>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    assert entry.id == "abc123xyz"
+    assert entry.guid == "abc123xyz"
+    assert entry.guid == entry.id
+
+
+def test_entry_description_alias():
+    """entry.description should map to entry.summary"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item>
+            <description>Entry summary text</description>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    assert entry.summary == "Entry summary text"
+    assert entry.description == "Entry summary text"
+    assert entry.description == entry.summary
+
+
+def test_entry_issued_alias():
+    """entry.issued should map to entry.published"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item>
+            <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    assert entry.published is not None
+    assert entry.issued == entry.published
+    # Both _parsed versions should work
+    assert entry.issued_parsed is not None
+    assert entry.issued_parsed.tm_year == 2024
+
+
+def test_entry_modified_alias():
+    """entry.modified should map to entry.updated"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <entry>
+            <updated>2024-01-15T10:30:00Z</updated>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    assert entry.updated is not None
+    assert entry.modified == entry.updated
+    assert entry.modified_parsed.tm_year == 2024
+
+
+def test_entry_date_alias_falls_back_to_updated():
+    """entry.date should map to entry.updated as primary fallback"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <entry>
+            <updated>2024-01-15T10:30:00Z</updated>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    assert entry.date == entry.updated
+    assert entry.date_parsed.tm_year == 2024
+
+
+def test_entry_date_alias_falls_back_to_published():
+    """entry.date should fall back to entry.published if updated is absent"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item>
+            <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed.entries[0]
+
+    # updated is None, so date should map to published
+    assert entry.updated is None
+    assert entry.published is not None
+    assert entry.date == entry.published
+    assert entry.date_parsed.tm_year == 2024
+
+
+def test_container_channel_alias():
+    """d.channel should map to d.feed (RSS uses `<channel>`)"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>RSS Feed Title</title>
+    </channel>
+    </rss>"""
+
+    d = feedparser_rs.parse(xml)
+
+    # Both should work and return the same object
+    assert d.feed.title == "RSS Feed Title"
+    assert d.channel.title == "RSS Feed Title"
+    # Verify they're the same object by checking id
+    assert d.channel.title == d.feed.title
+
+
+def test_container_items_alias():
+    """d.items should map to d.entries (RSS uses `<item>`)"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item><title>Item 1</title></item>
+        <item><title>Item 2</title></item>
+    </channel>
+    </rss>"""
+
+    d = feedparser_rs.parse(xml)
+
+    # Both should work and return the same list
+    assert len(d.entries) == 2
+    assert len(d.items) == 2
+    assert d.items[0].title == "Item 1"
+    assert d.items[1].title == "Item 2"
+
+
+def test_unknown_field_raises_attribute_error():
+    """Accessing unknown field should raise AttributeError"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test</title>
+        <item>
+            <title>Test Item</title>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Unknown fields should raise AttributeError
+    with pytest.raises(AttributeError, match="has no attribute"):
+        _ = feed.feed.nonexistent_field
+
+    with pytest.raises(AttributeError, match="has no attribute"):
+        _ = feed.entries[0].fake_attribute
+
+    with pytest.raises(AttributeError, match="has no attribute"):
+        _ = feed.this_does_not_exist
+
+
+def test_multiple_alias_access():
+    """Test accessing multiple aliases in same object"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <title>My Feed</title>
+        <subtitle>Feed description</subtitle>
+        <updated>2024-01-01T12:00:00Z</updated>
+        <rights>Copyright 2024</rights>
+        <entry>
+            <id>entry-1</id>
+            <title>Entry Title</title>
+            <summary>Entry summary</summary>
+            <published>2024-01-01T10:00:00Z</published>
+            <updated>2024-01-01T11:00:00Z</updated>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Feed-level aliases
+    assert feed.feed.description == "Feed description"
+    assert feed.feed.tagline == "Feed description"
+    assert feed.feed.modified is not None
+    assert feed.feed.copyright == "Copyright 2024"
+
+    # Entry-level aliases
+    entry = feed.entries[0]
+    assert entry.guid == "entry-1"
+    assert entry.description == "Entry summary"
+    assert entry.issued is not None
+    assert entry.modified is not None
+
+
+def test_detail_field_aliases():
+    """Test that _detail field aliases work correctly"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <subtitle type="html">&lt;b&gt;Bold subtitle&lt;/b&gt;</subtitle>
+        <rights>Copyright 2024</rights>
+        <entry>
+            <summary>Entry summary</summary>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Feed-level _detail aliases
+    assert feed.feed.subtitle_detail is not None
+    assert feed.feed.description_detail is not None
+    assert feed.feed.description_detail.type == feed.feed.subtitle_detail.type
+
+    assert feed.feed.rights_detail is not None
+    assert feed.feed.copyright_detail is not None
+    assert feed.feed.copyright_detail.type == feed.feed.rights_detail.type
+
+    # Entry-level _detail aliases
+    entry = feed.entries[0]
+    assert entry.summary_detail is not None
+    assert entry.description_detail is not None
+    assert entry.description_detail.value == entry.summary_detail.value
+
+
+def test_existing_attribute_access_still_works():
+    """Ensure normal attribute access is not affected by __getattr__"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <description>Feed description</description>
+        <item>
+            <title>Entry Title</title>
+            <link>https://example.com/entry</link>
+            <guid>entry-1</guid>
+            <description>Entry summary</description>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Direct attribute access should work normally
+    assert feed.feed.title == "Test Feed"
+    assert feed.feed.link == "https://example.com"
+    assert feed.feed.subtitle == "Feed description"
+
+    assert feed.entries[0].title == "Entry Title"
+    assert feed.entries[0].link == "https://example.com/entry"
+    assert feed.entries[0].id == "entry-1"
+    assert feed.entries[0].summary == "Entry summary"
+
+    # FeedParserDict level
+    assert feed.version is not None
+    assert feed.bozo is not None
+
+
+# Phase 2: Dict-style access tests (__getitem__)
+
+
+def test_dict_access_feed_fields():
+    """Test dict-style access for feed fields"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <description>Feed description</description>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Dict-style access should work
+    assert feed['feed']['title'] == "Test Feed"
+    assert feed['feed']['link'] == "https://example.com"
+    assert feed['feed']['subtitle'] == "Feed description"
+
+    # Mixed access should work
+    assert feed['feed'].title == "Test Feed"
+    assert feed.feed['title'] == "Test Feed"
+
+
+def test_dict_access_entry_fields():
+    """Test dict-style access for entry fields"""
+    xml = """<rss version="2.0">
+    <channel>
+        <item>
+            <title>Entry Title</title>
+            <link>https://example.com/entry</link>
+            <guid>entry-1</guid>
+            <description>Entry summary</description>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    entry = feed['entries'][0]
+
+    # Dict-style access should work
+    assert entry['title'] == "Entry Title"
+    assert entry['link'] == "https://example.com/entry"
+    assert entry['id'] == "entry-1"
+    assert entry['summary'] == "Entry summary"
+
+    # Mixed access should work
+    assert feed['entries'][0].title == "Entry Title"
+    assert feed.entries[0]['title'] == "Entry Title"
+
+
+def test_dict_access_with_deprecated_aliases():
+    """Test dict-style access with deprecated field names"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <title>My Feed</title>
+        <subtitle>Feed description</subtitle>
+        <updated>2024-01-01T12:00:00Z</updated>
+        <rights>Copyright 2024</rights>
+        <entry>
+            <id>entry-1</id>
+            <title>Entry Title</title>
+            <summary>Entry summary</summary>
+            <published>2024-01-01T10:00:00Z</published>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Feed-level deprecated aliases should work with dict access
+    assert feed['feed']['description'] == "Feed description"
+    assert feed['feed']['tagline'] == "Feed description"
+    assert feed['feed']['copyright'] == "Copyright 2024"
+    assert feed['feed']['modified'] is not None
+
+    # Entry-level deprecated aliases should work with dict access
+    entry = feed['entries'][0]
+    assert entry['guid'] == "entry-1"
+    assert entry['description'] == "Entry summary"
+    assert entry['issued'] is not None
+
+
+def test_dict_access_container_aliases():
+    """Test dict-style access with container name aliases"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>RSS Feed</title>
+        <item><title>Item 1</title></item>
+        <item><title>Item 2</title></item>
+    </channel>
+    </rss>"""
+
+    d = feedparser_rs.parse(xml)
+
+    # channel → feed alias should work with dict access
+    assert d['channel']['title'] == "RSS Feed"
+    assert d['feed']['title'] == "RSS Feed"
+
+    # items → entries alias should work with dict access
+    assert len(d['items']) == 2
+    assert len(d['entries']) == 2
+    assert d['items'][0]['title'] == "Item 1"
+    assert d['entries'][0]['title'] == "Item 1"
+
+
+def test_dict_access_top_level_fields():
+    """Test dict-style access for top-level FeedParserDict fields"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test</title>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Top-level fields should be accessible via dict-style
+    assert feed['version'] == 'rss20'
+    assert feed['bozo'] is False
+    assert feed['encoding'] is not None
+
+
+def test_dict_access_unknown_key_raises_keyerror():
+    """Dict access with unknown key should raise KeyError (not AttributeError)"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test</title>
+        <item>
+            <title>Test Item</title>
+        </item>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Unknown keys should raise KeyError for dict access
+    with pytest.raises(KeyError):
+        _ = feed['nonexistent_field']
+
+    with pytest.raises(KeyError):
+        _ = feed['feed']['fake_field']
+
+    with pytest.raises(KeyError):
+        _ = feed['entries'][0]['unknown_key']
+
+    # But AttributeError should still be raised for attribute access
+    with pytest.raises(AttributeError, match="has no attribute"):
+        _ = feed.feed.fake_field
+
+
+def test_dict_and_attribute_access_equivalence():
+    """Test that dict and attribute access return same values"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <title>My Feed</title>
+        <subtitle>Feed description</subtitle>
+        <link href="https://example.com"/>
+        <updated>2024-01-01T12:00:00Z</updated>
+        <entry>
+            <id>entry-1</id>
+            <title>Entry Title</title>
+            <summary>Entry summary</summary>
+            <link href="https://example.com/entry"/>
+            <updated>2024-01-01T10:00:00Z</updated>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Feed-level fields should be identical via both access methods
+    assert feed.feed.title == feed['feed']['title']
+    assert feed.feed.subtitle == feed['feed']['subtitle']
+    assert feed.feed.link == feed['feed']['link']
+    assert feed.feed.updated == feed['feed']['updated']
+
+    # Entry-level fields should be identical via both access methods
+    entry = feed.entries[0]
+    assert entry.id == entry['id']
+    assert entry.title == entry['title']
+    assert entry.summary == entry['summary']
+    assert entry.link == entry['link']
+    assert entry.updated == entry['updated']
+
+    # Top-level fields should be identical
+    assert feed.version == feed['version']
+    assert feed.bozo == feed['bozo']
+
+
+def test_dict_access_with_none_values():
+    """Test dict access returns None for missing optional fields"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Minimal Feed</title>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # Missing optional fields should return None via dict access
+    assert feed['feed']['subtitle'] is None
+    assert feed['feed']['updated'] is None
+    assert feed['feed']['author'] is None
+    assert feed['feed']['image'] is None
+
+
+def test_dict_access_detail_fields():
+    """Test dict access for _detail fields"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <subtitle type="html">&lt;b&gt;Bold subtitle&lt;/b&gt;</subtitle>
+        <rights>Copyright 2024</rights>
+        <entry>
+            <summary>Entry summary</summary>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # _detail fields should work with dict access
+    assert feed['feed']['subtitle_detail'] is not None
+    assert feed['feed']['subtitle_detail'].type == 'html'
+
+    assert feed['feed']['rights_detail'] is not None
+    assert feed['feed']['copyright_detail'] is not None
+    assert feed['feed']['copyright_detail'].type == 'text'
+
+    entry = feed['entries'][0]
+    assert entry['summary_detail'] is not None
+    assert entry['description_detail'] is not None
+
+
+def test_dict_access_list_fields():
+    """Test dict access for list fields (links, tags, authors, etc.)"""
+    xml = """<feed xmlns="http://www.w3.org/2005/Atom">
+        <link rel="self" href="https://example.com/feed"/>
+        <link rel="alternate" href="https://example.com"/>
+        <category term="technology"/>
+        <category term="rust"/>
+        <entry>
+            <link href="https://example.com/entry"/>
+            <category term="rust"/>
+        </entry>
+    </feed>"""
+
+    feed = feedparser_rs.parse(xml)
+
+    # List fields should work with dict access
+    assert len(feed['feed']['links']) == 2
+    assert feed['feed']['links'][0].href == "https://example.com/feed"
+
+    assert len(feed['feed']['tags']) == 2
+    assert feed['feed']['tags'][0].term == "technology"
+
+    entry = feed['entries'][0]
+    assert len(entry['links']) >= 1
+    assert len(entry['tags']) == 1
+    assert entry['tags'][0].term == "rust"
+
+
+# =============================================================================
+# Phase 4: Auto-URL Detection Tests
+# =============================================================================
+
+
+def test_parse_with_optional_http_params():
+    """Test that parse() accepts optional HTTP parameters for URL fetching"""
+    # When parsing content (not URL), these params should be ignored
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with optional params (they're just ignored for content)
+    feed = feedparser_rs.parse(xml, etag="some-etag", modified="some-date")
+    assert feed.feed.title == "Test Feed"
+    assert feed.version == 'rss20'
+
+
+def test_parse_with_user_agent_param():
+    """Test that parse() accepts user_agent parameter"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with user_agent param (ignored for content)
+    feed = feedparser_rs.parse(xml, user_agent="TestBot/1.0")
+    assert feed.feed.title == "Test Feed"
+
+
+def test_parse_url_detection_http():
+    """Test that parse() detects http:// URLs"""
+    # This test verifies URL detection logic without actually fetching
+    # Since we don't have an HTTP feature enabled or a real server,
+    # we just verify the parse function signature accepts URL-like strings
+    try:
+        # This will either succeed (if http feature enabled and server exists)
+        # or raise NotImplementedError (if http feature disabled)
+        feedparser_rs.parse("http://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_url_detection_https():
+    """Test that parse() detects https:// URLs"""
+    try:
+        feedparser_rs.parse("https://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_content_starting_with_http_in_text():
+    """Test that content containing 'http' as text is not treated as URL"""
+    # This should be parsed as content, not as a URL
+    xml = """<rss version="2.0">
+    <channel>
+        <title>HTTP Guide</title>
+        <description>Learn about http protocol</description>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "HTTP Guide"
+    assert "http" in feed.feed.subtitle.lower()
+
+
+def test_parse_bytes_content():
+    """Test that bytes content is still parsed correctly"""
+    xml = b"""<rss version="2.0">
+    <channel>
+        <title>Bytes Feed</title>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "Bytes Feed"
+
+
+def test_parse_with_limits_accepts_http_params():
+    """Test that parse_with_limits() also accepts HTTP parameters"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    limits = feedparser_rs.ParserLimits()
+
+    # Should work with all optional params
+    feed = feedparser_rs.parse_with_limits(
+        xml,
+        etag="etag",
+        modified="modified",
+        user_agent="TestBot/1.0",
+        limits=limits
+    )
+    assert feed.feed.title == "Test Feed"
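+
+
+def test_parse_with_limits_returns_compat_dict():
+    """Sanity sketch (assumed behaviour, not part of the original suite):
+    parse_with_limits() results should support the same dict-style and
+    alias access as parse(), since both return a FeedParserDict."""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Limited Feed</title>
+        <description>Limited description</description>
+    </channel>
+    </rss>"""
+
+    limits = feedparser_rs.ParserLimits(max_entries=5)
+    d = feedparser_rs.parse_with_limits(xml, limits=limits)
+
+    assert d['feed']['title'] == "Limited Feed"
+    assert d.channel.title == "Limited Feed"
+    assert d.feed.description == "Limited description"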