Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ repos:
additional_dependencies:
- orjson # Ref: https://github.com/python/mypy/blob/v1.13.0/CHANGELOG.md#improved-performance
- httpx>=0.27
- pandas-stubs>=2.2
- pytest>=8.2
- respx>=0.21
- typer>=0.12
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
- dataclass (default): Returns a Python dataclass object, allowing convenient access to the elements of the re3data
schema
- response: Returns a Python object representing the API response
- original XML: Returns the raw XML response from the API
- JSON: Returns a JSON representation of the API response
- original XML (str): Returns the raw XML response from the API
- JSON (str): Returns a JSON representation of the API response
- dictionary: Returns a dictionary representation of the API response
- CSV (str): Returns a CSV representation of the API response
- dataframe: Returns a pandas.DataFrame representation of the API response

## Requirements

Expand All @@ -61,6 +63,8 @@ Repository(re3data_org_identifier='r3d100010468', repository_name=RepositoryName
schemas, simplifies processing of API responses.
- **Optional CLI**: [typer](https://github.com/tiangolo/typer), a popular library for building command-line interfaces,
powers the user-friendly interface.
- **Optional DataFrame/CSV**: [pandas](https://github.com/pandas-dev/pandas), a powerful and flexible data analysis
library, enables generation of DataFrames and CSV files from parsed XML responses.

## Installation

Expand Down
8 changes: 8 additions & 0 deletions docs/src/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ Install with `python -m pip install "python-re3data[cli]"`.
| ------------------------------------------ | ------- | ------------------------------------------------------------------------------------------- |
| [typer](https://github.com/tiangolo/typer) | >= 0.12 | A popular library for building command-line interfaces, powers the user-friendly interface. |

#### CSV

Install with `python -m pip install "python-re3data[csv]"`.

| Package | Version | Description |
| ---------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------ |
| [pandas](https://github.com/pandas-dev/pandas) | >= 2.0 | A powerful and flexible data analysis library, enables generation of DataFrames and CSV files from parsed XML responses. |

<!---
This installation guide is adapted from these sources:
- "pandas" Installation, https://pandas.pydata.org/docs/getting_started/install.html (BSD-3-Clause license)
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,12 @@ dependencies = [
optional-dependencies.cli = [
"typer>=0.12",
]
optional-dependencies.csv = [
"pandas>=2",
]
optional-dependencies.dev = [
"pre-commit-uv~=4.1",
"python-re3data[cli]",
"python-re3data[cli,csv]",
]
optional-dependencies.docs = [
"mike~=2.1",
Expand Down
7 changes: 4 additions & 3 deletions src/re3data/_client/_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@
from re3data._response import Response, _build_response

if TYPE_CHECKING:
from re3data._resources import Repository, RepositorySummary
from pandas import DataFrame

from re3data._resources import Repository, RepositorySummary
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -59,7 +60,7 @@ async def list(
query: str | None = None,
return_type: ReturnType = ReturnType.DATACLASS,
count: bool = False,
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
"""List the metadata of all repositories in the re3data API.

Args:
Expand All @@ -83,7 +84,7 @@ async def list(

async def get(
self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
) -> Repository | Response | dict[str, Any] | str:
) -> Repository | Response | dict[str, Any] | DataFrame | str:
"""Get the metadata of a specific repository.

Args:
Expand Down
7 changes: 4 additions & 3 deletions src/re3data/_client/_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
from re3data._response import Response, _build_response

if TYPE_CHECKING:
from re3data._resources import Repository, RepositorySummary
from pandas import DataFrame

from re3data._resources import Repository, RepositorySummary
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -61,7 +62,7 @@ def list(
query: str | None = None,
return_type: ReturnType = ReturnType.DATACLASS,
count: bool = False,
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int:
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
"""List the metadata of all repositories in the re3data API.

Args:
Expand All @@ -85,7 +86,7 @@ def list(

def get(
self, repository_id: str, return_type: ReturnType = ReturnType.DATACLASS
) -> Repository | Response | dict[str, Any] | str:
) -> Repository | Response | dict[str, Any] | DataFrame | str:
"""Get the metadata of a specific repository.

Args:
Expand Down
26 changes: 19 additions & 7 deletions src/re3data/_client/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@

from re3data import __version__
from re3data._response import Response, _count_repositories, _parse_repositories_response, _parse_repository_response
from re3data._serializer import _to_dict, _to_json
from re3data._serializer import _to_csv, _to_dataframe, _to_dict, _to_json

if TYPE_CHECKING:
from pandas import DataFrame

from re3data._resources import Repository, RepositorySummary

BASE_URL: str = "https://www.re3data.org/api/beta/"
Expand All @@ -37,7 +39,9 @@ class ResourceType(str, Enum):


class ReturnType(str, Enum):
CSV = "csv"
DATACLASS = "dataclass"
DATAFRAME = "dataframe"
DICT = "dict"
JSON = "json"
RESPONSE = "response"
Expand Down Expand Up @@ -80,19 +84,19 @@ def _build_query_params(query: str | None = None) -> dict[str, str]:
@overload
def _dispatch_return_type(
response: Response, resource_type: Literal[ResourceType.REPOSITORY], return_type: ReturnType, count: bool = False
) -> Repository | Response | dict[str, Any] | str: ...
) -> Repository | Response | dict[str, Any] | DataFrame | str: ...
@overload
def _dispatch_return_type(
response: Response,
resource_type: Literal[ResourceType.REPOSITORY_LIST],
return_type: ReturnType,
count: bool = False,
) -> list[RepositorySummary] | Response | dict[str, Any] | str | int: ...
) -> list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int: ...


def _dispatch_return_type(
def _dispatch_return_type( # noqa: PLR0911
response: Response, resource_type: ResourceType, return_type: ReturnType, count: bool = False
) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | str | int:
) -> Repository | list[RepositorySummary] | Response | dict[str, Any] | DataFrame | str | int:
"""Dispatch the response to the correct return type based on the provided return type and resource type.

Args:
Expand All @@ -105,14 +109,15 @@ def _dispatch_return_type(
Depending on the return_type and resource_type, this can be a Repository object, a list of RepositorySummary
objects, an HTTP response, a dictionary representation or the original XML.
"""
# return the count of repositories, the response or the original xml before parsing the response
if resource_type == ResourceType.REPOSITORY_LIST and count:
return _count_repositories(response.text)

if return_type == ReturnType.RESPONSE:
return response
if return_type == ReturnType.XML:
return response.text

# all subsequent return types rely on parsing the response first
parsed: Repository | list[RepositorySummary]
if resource_type == ResourceType.REPOSITORY_LIST:
parsed = _parse_repositories_response(response)
Expand All @@ -121,9 +126,16 @@ def _dispatch_return_type(
if return_type == ReturnType.DATACLASS:
return parsed

# JSON and dictionary
if return_type == ReturnType.JSON:
return _to_json(parsed)
return _to_dict(parsed)
if return_type == ReturnType.DICT:
return _to_dict(parsed)

# tabular representations: DataFrame and CSV
if return_type == ReturnType.DATAFRAME:
return _to_dataframe(parsed)
return _to_csv(parsed)


class BaseClient:
Expand Down
57 changes: 53 additions & 4 deletions src/re3data/_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,41 @@
#
# SPDX-License-Identifier: MIT

"""The _serializer module offers functions for converting parsed data into dictionaries or JSON strings.
"""The _serializer module offers functions for converting parsed data into various return types.

This module provides functions to serialize various types of data into dictionaries or JSON strings.
This module provides functions to serialize various types of data, e.g. into dictionaries or JSON strings.
The serialized data can be used for further processing or storage.

Functions:
_to_dict: Serialize parsed data into a dictionary.
_to_json: Serialize parsed data into a JSON string.
_to_dataframe: Serialize parsed data into a DataFrame.
_to_csv: Serialize parsed data into a CSV string.
"""

from typing import Any
from __future__ import annotations

import logging
import sys
from typing import TYPE_CHECKING, Any

try:
from pandas import json_normalize

PANDAS_INSTALLED = True
except ImportError:
PANDAS_INSTALLED = False

from xsdata.formats.dataclass.context import XmlContext
from xsdata.formats.dataclass.serializers import DictEncoder, JsonSerializer
from xsdata.formats.dataclass.serializers.config import SerializerConfig

from re3data._resources import Repository, RepositorySummary
if TYPE_CHECKING:
from pandas import DataFrame

from re3data._resources import Repository, RepositorySummary

logger = logging.getLogger(__name__)
CONFIG = SerializerConfig(indent=" ")
CONTEXT = XmlContext()

Expand Down Expand Up @@ -51,3 +68,35 @@ def _to_json(parsed: Repository | list[RepositorySummary]) -> str:
A JSON representation of the input data.
"""
return JSON_SERIALIZER.render(parsed)


def _to_dataframe(parsed: Repository | list[RepositorySummary]) -> DataFrame:
    """Convert parsed re3data resources into a flat DataFrame.

    The parsed data is first turned into its dictionary form via `_to_dict` and then handed to
    `json_normalize`.

    Args:
        parsed: Either a single `Repository` object or a list of `RepositorySummary` objects.

    Returns:
        The DataFrame produced by `json_normalize` for the dictionary form of `parsed`.

    Raises:
        SystemExit: If `pandas` is not installed; an error is logged before exiting with status 1.
    """
    # Guard clause: without the optional dependency there is nothing we can build.
    if not PANDAS_INSTALLED:
        logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
        sys.exit(1)
    as_dict = _to_dict(parsed)
    return json_normalize(as_dict)


def _to_csv(parsed: Repository | list[RepositorySummary]) -> str:
    """Convert parsed re3data resources into CSV text.

    The data is converted to a DataFrame with `_to_dataframe` and then rendered with
    `DataFrame.to_csv(index=False)`, so the index is not written as a column.

    Args:
        parsed: Either a single `Repository` object or a list of `RepositorySummary` objects.

    Returns:
        The CSV text for `parsed`.

    Raises:
        SystemExit: If `pandas` is not installed; an error is logged before exiting with status 1.
    """
    # Guard clause: bail out early when the optional dependency is unavailable.
    if not PANDAS_INSTALLED:
        logger.error("`pandas` is missing. Please run 'pip install python-re3data[csv]'.")
        sys.exit(1)
    frame = _to_dataframe(parsed)
    return frame.to_csv(index=False)
34 changes: 34 additions & 0 deletions tests/integration/test_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import httpx
import pytest
from pandas import DataFrame

from re3data import RepositoryNotFoundError, Response, ReturnType
from re3data._resources import Repository, RepositoryName, RepositorySummary
Expand Down Expand Up @@ -64,6 +65,21 @@ async def test_client_list_repositories_dict(async_client: AsyncClient, mock_rep
assert repository["id"] == "r3d100010371"


async def test_client_list_repositories_csv(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
    """Listing repositories with the CSV return type yields CSV text with the expected header and rows."""
    csv_text = await async_client.repositories.list(return_type=ReturnType.CSV)
    assert isinstance(csv_text, str)
    # The header row comes first, followed by the repository records.
    assert csv_text.startswith("id,doi,name,")
    assert "r3d100010371" in csv_text
    assert "https://doi.org/10.17616/R3P594" in csv_text


async def test_client_list_repositories_dataframe(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
    """Listing repositories with the DATAFRAME return type yields a DataFrame of the expected shape."""
    frame = await async_client.repositories.list(return_type=ReturnType.DATAFRAME)
    assert isinstance(frame, DataFrame)
    assert frame.shape == (3, 5)
    first_id = frame["id"].loc[0]
    assert first_id == "r3d100010371"


async def test_client_list_repositories_response(async_client: AsyncClient, mock_repository_list_route: Route) -> None:
response = await async_client.repositories.list(return_type=ReturnType.RESPONSE)
assert isinstance(response, Response)
Expand Down Expand Up @@ -139,6 +155,24 @@ async def test_client_get_single_repository_dict(
assert response["re3data.orgIdentifier"] == zenodo_id


async def test_client_get_single_repository_csv(
    async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
) -> None:
    """Fetching one repository with the CSV return type yields CSV text with the expected header and identifier."""
    csv_text = await async_client.repositories.get(zenodo_id, return_type=ReturnType.CSV)
    assert isinstance(csv_text, str)
    # The header row comes first, followed by the single repository record.
    assert csv_text.startswith("re3data.orgIdentifier,additionalName,repositoryURL,")
    assert "r3d100010468" in csv_text


async def test_client_get_single_repository_dataframe(
    async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
) -> None:
    """Fetching one repository with the DATAFRAME return type yields a single-row DataFrame."""
    frame = await async_client.repositories.get(zenodo_id, return_type=ReturnType.DATAFRAME)
    assert isinstance(frame, DataFrame)
    assert frame.shape == (1, 43)
    identifier = frame["re3data.orgIdentifier"].loc[0]
    assert identifier == "r3d100010468"


async def test_client_get_single_repository_response(
async_client: AsyncClient, mock_repository_get_route: Route, zenodo_id: str
) -> None:
Expand Down
Loading
Loading