afuetterer · afuetterer · Dec 11, 2024 · Jul 6, 2024
@@ -94,6 +94,7 @@ repos:
     additional_dependencies:
     - orjson # Ref: https://github.com/python/mypy/blob/v1.13.0/CHANGELOG.md#improved-performance
     - httpx>=0.27
+    - pandas-stubs>=2.2
     - pytest>=8.2
     - respx>=0.21
     - typer>=0.12

@@ -45,9 +45,11 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
     - dataclass (default): Returns a Python dataclass object, allowing convenient access to the elements of the re3data
         schema
     - response: Returns a Python object representing the API response
-    - original XML: Returns the raw XML response from the API
-    - JSON: Returns a JSON representation of the API response
+    - original XML (str): Returns the raw XML response from the API
+    - JSON (str): Returns a JSON representation of the API response
     - dictionary: Returns a dictionary representation of the API response
+    - CSV (str): Returns a CSV representation of the API response
+    - dataframe: Returns a pandas.DataFrame representation of the API response
 
 ## Requirements
 
@@ -61,6 +63,8 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
     schemas, simplifies processing of API responses.
 - **Optional CLI**: [typer](https://github.com/tiangolo/typer), a popular library for building command-line interfaces,
     powers the user-friendly interface.
+- **Optional DataFrame/CSV**: [pandas](https://github.com/pandas-dev/pandas), a powerful and flexible data analysis
+    library, enables generation of DataFrames and CSV files from parsed XML responses.
 
 ## Installation
 

@@ -129,6 +129,14 @@ Install with `python -m pip install "python-re3data[cli]"`.
 | ------------------------------------------ | ------- | ------------------------------------------------------------------------------------------- |
 | [typer](https://github.com/tiangolo/typer) | >= 0.12 | A popular library for building command-line interfaces, powers the user-friendly interface. |
 
+#### CSV
+
+Install with `python -m pip install "python-re3data[csv]"`.
+
+| Package                                        | Version | Description                                                                                                              |
+| ---------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------ |
+| [pandas](https://github.com/pandas-dev/pandas) | >= 2.0  | A powerful and flexible data analysis library, enables generation of DataFrames and CSV files from parsed XML responses. |
+
 <!---
 This installation guide is adapted from these sources:
 - "pandas" Installation, https://pandas.pydata.org/docs/getting_started/install.html (BSD-3-Clause license)

@@ -49,9 +49,12 @@ dependencies = [
 optional-dependencies.cli = [
   "typer>=0.12",
 ]
+optional-dependencies.csv = [
+  "pandas>=2",
+]
 optional-dependencies.dev = [
   "pre-commit-uv~=4.1",
-  "python-re3data[cli]",
+  "python-re3data[cli,csv]",
 ]
 optional-dependencies.docs = [
   "mike~=2.1",

@@ -22,8 +22,9 @@
 from re3data._response import Response, _build_response
 
 if TYPE_CHECKING:
-    from re3data._resources import Repository, RepositorySummary
+    from pandas import DataFrame
 
+    from re3data._resources import Repository, RepositorySummary
 logger = logging.getLogger(__name__)
 
 
@@ -59,7 +60,7 @@ async def list(
         query: str | None = None,
         return_type: ReturnType = ReturnType.DATACLASS,
         count: bool = False,
-    ) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
+    ) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
         """List the metadata of all repositories in the re3data API.
 
         Args:
@@ -83,7 +84,7 @@ async def list(
 
     async def get(
         self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
-    ) -> Repository | Response | dict[str, Any] | str:
+    ) -> Repository | Response | dict[str, Any] | DataFrame | str:
         """Get the metadata of a specific repository.
 
         Args:

@@ -24,8 +24,9 @@
 from re3data._response import Response, _build_response
 
 if TYPE_CHECKING:
-    from re3data._resources import Repository, RepositorySummary
+    from pandas import DataFrame
 
+    from re3data._resources import Repository, RepositorySummary
 logger = logging.getLogger(__name__)
 
 
@@ -61,7 +62,7 @@ def list(
         query: str | None = None,
         return_type: ReturnType = ReturnType.DATACLASS,
         count: bool = False,
-    ) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
+    ) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
         """List the metadata of all repositories in the re3data API.
 
         Args:
@@ -85,7 +86,7 @@ def list(
 
     def get(
         self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
-    ) -> Repository | Response | dict[str, Any] | str:
+    ) -> Repository | Response | dict[str, Any] | DataFrame | str:
         """Get the metadata of a specific repository.
 
         Args:

@@ -13,9 +13,11 @@
 
 from re3data import __version__
 from re3data._response import Response, _count_repositories, _parse_repositories_response, _parse_repository_response
-from re3data._serializer import _to_dict, _to_json
+from re3data._serializer import _to_csv, _to_dataframe, _to_dict, _to_json
 
 if TYPE_CHECKING:
+    from pandas import DataFrame
+
     from re3data._resources import Repository, RepositorySummary
 
 BASE_URL: str = "https://www.re3data.org/api/beta/"
@@ -37,7 +39,9 @@ class ResourceType(str, Enum):
 
 
 class ReturnType(str, Enum):
+    CSV = "csv"
     DATACLASS = "dataclass"
+    DATAFRAME = "dataframe"
     DICT = "dict"
     JSON = "json"
     RESPONSE = "response"
@@ -80,19 +84,19 @@ def _build_query_params(query: str | None = None) -> dict[str, str]:
 @overload
 def _dispatch_return_type(
     response: Response, resource_type: Literal[ResourceType.REPOSITORY], return_type: ReturnType, count: bool = False
-) -> Repository | Response | dict[str, Any] | str: ...
+) -> Repository | Response | dict[str, Any] | DataFrame | str: ...
 @overload
 def _dispatch_return_type(
     response: Response,
     resource_type: Literal[ResourceType.REPOSITORY_LIST],
     return_type: ReturnType,
     count: bool = False,
-) -> list[RepositorySummary] | Response | dict[str, Any] | str | int: ...
+) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int: ...
 
 
-def _dispatch_return_type(
+def _dispatch_return_type(  # noqa: PLR0911
     response: Response, resource_type: ResourceType, return_type: ReturnType, count: bool = False
-) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | str | int:
+) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
     """Dispatch the response to the correct return type based on the provided return type and resource type.
 
     Args:
@@ -105,14 +109,15 @@ def _dispatch_return_type(
         Depending on the return_type and resource_type, this can be a Repository object, a list of RepositorySummary
             objects, an HTTP response, a dictionary representation, a JSON or CSV string, a pandas DataFrame, the
             original XML or the number of repositories.
     """
+    # return the count of repositories, the response or the original xml before parsing the response
     if resource_type == ResourceType.REPOSITORY_LIST and count:
         return _count_repositories(response.text)
-
     if return_type == ReturnType.RESPONSE:
         return response
     if return_type == ReturnType.XML:
         return response.text
 
+    # all subsequent return types rely on parsing the response first
     parsed: Repository | list[RepositorySummary]
     if resource_type == ResourceType.REPOSITORY_LIST:
         parsed = _parse_repositories_response(response)
@@ -121,9 +126,16 @@ def _dispatch_return_type(
     if return_type == ReturnType.DATACLASS:
         return parsed
 
+    # JSON and dictionary
     if return_type == ReturnType.JSON:
         return _to_json(parsed)
-    return _to_dict(parsed)
+    if return_type == ReturnType.DICT:
+        return _to_dict(parsed)
+
+    # tabular representations: DataFrame and CSV
+    if return_type == ReturnType.DATAFRAME:
+        return _to_dataframe(parsed)
+    return _to_csv(parsed)
 
 
 class BaseClient:

@@ -2,24 +2,41 @@
 #
 # SPDX-License-Identifier: MIT
 
-"""The _serializer module offers functions for converting parsed data into dictionaries or JSON strings.
+"""The _serializer module offers functions for converting parsed data into various return types.
 
-This module provides functions to serialize various types of data into dictionaries or JSON strings.
+This module provides functions to serialize various types of data, e.g. into dictionaries or JSON strings.
 The serialized data can be used for further processing or storage.
 
 Functions:
     _to_dict: Serialize parsed data into a dictionary.
     _to_json: Serialize parsed data into a JSON string.
+    _to_dataframe: Serialize parsed data into a DataFrame.
+    _to_csv: Serialize parsed data into a CSV string.
 """
 
-from typing import Any
+from __future__ import annotations
+
+import logging
+import sys
+from typing import TYPE_CHECKING, Any
+
+try:
+    from pandas import json_normalize
+
+    PANDAS_INSTALLED = True
+except ImportError:
+    PANDAS_INSTALLED = False
 
 from xsdata.formats.dataclass.context import XmlContext
 from xsdata.formats.dataclass.serializers import DictEncoder, JsonSerializer
 from xsdata.formats.dataclass.serializers.config import SerializerConfig
 
-from re3data._resources import Repository, RepositorySummary
+if TYPE_CHECKING:
+    from pandas import DataFrame
 
+    from re3data._resources import Repository, RepositorySummary
+
+logger = logging.getLogger(__name__)
 CONFIG = SerializerConfig(indent="  ")
 CONTEXT = XmlContext()
 
@@ -51,3 +68,35 @@ def _to_json(parsed: Repository | list[RepositorySummary]) -> str:
         A JSON representation of the input data.
     """
     return JSON_SERIALIZER.render(parsed)
+
+
+def _to_dataframe(parsed: Repository | list[RepositorySummary]) -> DataFrame:
+    """Serialize parsed data into a DataFrame.
+
+    Args:
+        parsed: The input data to be serialized. It can be either a single `Repository` object or a list of
+            `RepositorySummary` objects.
+
+    Returns:
+        A DataFrame representation of the input data.
+    """
+    if PANDAS_INSTALLED:
+        return json_normalize(_to_dict(parsed))
+    logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
+    sys.exit(1)
+
+
+def _to_csv(parsed: Repository | list[RepositorySummary]) -> str:
+    """Serialize parsed data into a CSV string.
+
+    Args:
+        parsed: The input data to be serialized. It can be either a single `Repository` object or a list of
+            `RepositorySummary` objects.
+
+    Returns:
+        A CSV string representation of the input data.
+    """
+    if PANDAS_INSTALLED:
+        return _to_dataframe(parsed).to_csv(index=False)
+    logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
+    sys.exit(1)
@@ -8,6 +8,7 @@
 
 import httpx
 import pytest
+from pandas import DataFrame
 
 from re3data import RepositoryNotFoundError, Response, ReturnType
 from re3data._resources import Repository, RepositoryName, RepositorySummary
@@ -64,6 +65,21 @@ async def test_client_list_repositories_dict(async_client: AsyncClient, mock_rep
     assert repository["id"] == "r3d100010371"
 
 
+async def test_client_list_repositories_csv(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
+    response = await async_client.repositories.list(return_type=ReturnType.CSV)
+    assert isinstance(response, str)
+    assert response.startswith("id,doi,name,")
+    assert "r3d100010371" in response
+    assert "https://doi.org/10.17616/R3P594" in response
+
+
+async def test_client_list_repositories_dataframe(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
+    response = await async_client.repositories.list(return_type=ReturnType.DATAFRAME)
+    assert isinstance(response, DataFrame)
+    assert response.shape == (3, 5)
+    assert response["id"].loc[0] == "r3d100010371"
+
+
 async def test_client_list_repositories_response(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
     response = await async_client.repositories.list(return_type=ReturnType.RESPONSE)
     assert isinstance(response, Response)
@@ -139,6 +155,24 @@ async def test_client_get_single_repository_dict(
     assert response["re3data.orgIdentifier"] == zenodo_id
 
 
+async def test_client_get_single_repository_csv(
+    async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
+) -> None:
+    response = await async_client.repositories.get(zenodo_id, return_type=ReturnType.CSV)
+    assert isinstance(response, str)
+    assert response.startswith("re3data.orgIdentifier,additionalName,repositoryURL,")
+    assert "r3d100010468" in response
+
+
+async def test_client_get_single_repository_dataframe(
+    async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
+) -> None:
+    response = await async_client.repositories.get(zenodo_id, return_type=ReturnType.DATAFRAME)
+    assert isinstance(response, DataFrame)
+    assert response.shape == (1, 43)
+    assert response["re3data.orgIdentifier"].loc[0] == "r3d100010468"
+
+
 async def test_client_get_single_repository_response(
     async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
 ) -> None: