|
| 1 | +"""Arrow IPC helpers for ``GET /v1/results/{id}``. |
| 2 | +
|
| 3 | +The auto-generated :class:`hotdata.api.results_api.ResultsApi` understands the |
| 4 | +``format=arrow`` query parameter but cannot decode the |
| 5 | +``application/vnd.apache.arrow.stream`` response body — openapi-generator picks |
| 6 | +the JSON content variant for status 200 and routes Arrow bytes through the |
| 7 | +JSON deserializer, which raises ``Unsupported content type``. |
| 8 | +
|
| 9 | +This module wraps the generated client with a thin subclass that: |
| 10 | +
|
| 11 | +* sets ``Accept: application/vnd.apache.arrow.stream`` and ``?format=arrow``, |
| 12 | +* uses the generator's ``*_without_preload_content`` plumbing to hold the |
| 13 | + underlying ``urllib3.HTTPResponse`` open as a byte stream, |
| 14 | +* hands that stream to ``pyarrow.ipc.open_stream`` so callers get a |
| 15 | + :class:`pyarrow.Table` (or a :class:`pyarrow.RecordBatchStreamReader` for |
| 16 | + the streaming variant). |
| 17 | +
|
| 18 | +Install with ``pip install 'hotdata[arrow]'`` to pull in pyarrow. |
| 19 | +""" |
| 20 | + |
| 21 | +from __future__ import annotations |
| 22 | + |
| 23 | +from contextlib import contextmanager |
| 24 | +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional |
| 25 | + |
| 26 | +from hotdata.api.results_api import ResultsApi as _GeneratedResultsApi |
| 27 | +from hotdata.models.results_format_query import ResultsFormatQuery |
| 28 | + |
| 29 | +if TYPE_CHECKING: # pragma: no cover - import-time only for type checkers |
| 30 | + import pyarrow as pa # type: ignore[import-untyped] |
| 31 | + |
| 32 | + |
| 33 | +ARROW_STREAM_MEDIA_TYPE = "application/vnd.apache.arrow.stream" |
| 34 | + |
| 35 | + |
| 36 | +class ResultNotReadyError(Exception): |
| 37 | + """Raised when the result exists but is not yet ``ready``. |
| 38 | +
|
| 39 | + The server replies with HTTP 202 while a result is ``pending`` or |
| 40 | + ``processing``. Poll :meth:`ResultsApi.get_result` until ``status='ready'`` |
| 41 | + before fetching as Arrow. |
| 42 | + """ |
| 43 | + |
| 44 | + def __init__(self, status: str, result_id: str) -> None: |
| 45 | + self.status = status |
| 46 | + self.result_id = result_id |
| 47 | + super().__init__( |
| 48 | + f"Result {result_id} is not ready (status={status!r}); " |
| 49 | + "poll get_result until status='ready' before fetching as Arrow." |
| 50 | + ) |
| 51 | + |
| 52 | + |
| 53 | +def _import_pyarrow() -> Any: |
| 54 | + try: |
| 55 | + import pyarrow.ipc as ipc # type: ignore[import-untyped] |
| 56 | + except ImportError as exc: # pragma: no cover - exercised via tests |
| 57 | + raise ImportError( |
| 58 | + "pyarrow is required to fetch results as Arrow. " |
| 59 | + "Install with: pip install 'hotdata[arrow]'" |
| 60 | + ) from exc |
| 61 | + return ipc |
| 62 | + |
| 63 | + |
| 64 | +class ResultsApi(_GeneratedResultsApi): |
| 65 | + """Drop-in replacement for :class:`hotdata.api.results_api.ResultsApi` |
| 66 | + that adds Arrow IPC fetch helpers. |
| 67 | +
|
| 68 | + All methods on the base class continue to work unchanged. |
| 69 | + """ |
| 70 | + |
| 71 | + def get_result_arrow( |
| 72 | + self, |
| 73 | + id: str, |
| 74 | + *, |
| 75 | + offset: Optional[int] = None, |
| 76 | + limit: Optional[int] = None, |
| 77 | + _request_timeout: Any = None, |
| 78 | + ) -> "pa.Table": |
| 79 | + """Fetch a ready result as a :class:`pyarrow.Table`. |
| 80 | +
|
| 81 | + Buffers the full Arrow IPC stream into memory before returning. Use |
| 82 | + :meth:`stream_result_arrow` for large results where you want to |
| 83 | + iterate batches without materializing the whole table. |
| 84 | +
|
| 85 | + :param id: Result ID. |
| 86 | + :param offset: Rows to skip (default: 0). |
| 87 | + :param limit: Maximum rows to return (default: unbounded). |
| 88 | + :raises ResultNotReadyError: result is still pending or processing. |
| 89 | + :raises hotdata.exceptions.ApiException: for other HTTP errors |
| 90 | + (400 invalid params, 404 not found, 409 failed result). |
| 91 | + """ |
| 92 | + ipc = _import_pyarrow() |
| 93 | + response = self._call_arrow(id=id, offset=offset, limit=limit, |
| 94 | + _request_timeout=_request_timeout) |
| 95 | + try: |
| 96 | + return ipc.open_stream(response).read_all() |
| 97 | + finally: |
| 98 | + response.release_conn() |
| 99 | + |
| 100 | + @contextmanager |
| 101 | + def stream_result_arrow( |
| 102 | + self, |
| 103 | + id: str, |
| 104 | + *, |
| 105 | + offset: Optional[int] = None, |
| 106 | + limit: Optional[int] = None, |
| 107 | + _request_timeout: Any = None, |
| 108 | + ) -> Iterator["pa.RecordBatchStreamReader"]: |
| 109 | + """Yield a :class:`pyarrow.RecordBatchStreamReader` for a ready result. |
| 110 | +
|
| 111 | + The HTTP connection is released when the context exits. Iterate the |
| 112 | + reader to consume :class:`pyarrow.RecordBatch` messages, or call |
| 113 | + ``reader.read_all()`` for a full :class:`pyarrow.Table`. |
| 114 | +
|
| 115 | + Example:: |
| 116 | +
|
| 117 | + with results.stream_result_arrow(result_id) as reader: |
| 118 | + for batch in reader: |
| 119 | + process(batch) |
| 120 | +
|
| 121 | + :raises ResultNotReadyError: result is still pending or processing. |
| 122 | + :raises hotdata.exceptions.ApiException: for other HTTP errors. |
| 123 | + """ |
| 124 | + ipc = _import_pyarrow() |
| 125 | + response = self._call_arrow(id=id, offset=offset, limit=limit, |
| 126 | + _request_timeout=_request_timeout) |
| 127 | + try: |
| 128 | + yield ipc.open_stream(response) |
| 129 | + finally: |
| 130 | + response.release_conn() |
| 131 | + |
| 132 | + def _call_arrow( |
| 133 | + self, |
| 134 | + *, |
| 135 | + id: str, |
| 136 | + offset: Optional[int], |
| 137 | + limit: Optional[int], |
| 138 | + _request_timeout: Any, |
| 139 | + ) -> Any: |
| 140 | + # Build the request via the generator's private serialize helper so |
| 141 | + # path/query/auth handling stays in lockstep with the generated client. |
| 142 | + # Override only what we need: the Accept header and the format query. |
| 143 | + headers: Dict[str, Any] = {"Accept": ARROW_STREAM_MEDIA_TYPE} |
| 144 | + params = self._get_result_serialize( |
| 145 | + id=id, |
| 146 | + offset=offset, |
| 147 | + limit=limit, |
| 148 | + format=ResultsFormatQuery.ARROW, |
| 149 | + _request_auth=None, |
| 150 | + _content_type=None, |
| 151 | + _headers=headers, |
| 152 | + _host_index=0, |
| 153 | + ) |
| 154 | + response_data = self.api_client.call_api( |
| 155 | + *params, |
| 156 | + _request_timeout=_request_timeout, |
| 157 | + ) |
| 158 | + |
| 159 | + if response_data.status == 200: |
| 160 | + # Hand the raw urllib3.HTTPResponse to the caller. preload_content |
| 161 | + # was False on the way in, so the body has not been consumed. |
| 162 | + return response_data.response |
| 163 | + |
| 164 | + # Non-200: drain, deserialize as JSON, then raise. response_deserialize |
| 165 | + # raises ApiException for status >= 400 itself; only 202 falls through. |
| 166 | + try: |
| 167 | + response_data.read() |
| 168 | + body = self.api_client.response_deserialize( |
| 169 | + response_data=response_data, |
| 170 | + response_types_map={ |
| 171 | + "202": "GetResultResponse", |
| 172 | + "400": "ApiErrorResponse", |
| 173 | + "404": "ApiErrorResponse", |
| 174 | + "409": "GetResultResponse", |
| 175 | + }, |
| 176 | + ).data |
| 177 | + finally: |
| 178 | + response_data.response.release_conn() |
| 179 | + |
| 180 | + raise ResultNotReadyError( |
| 181 | + status=getattr(body, "status", "pending"), |
| 182 | + result_id=getattr(body, "result_id", id), |
| 183 | + ) |
| 184 | + |
| 185 | + |
| 186 | +__all__ = [ |
| 187 | + "ARROW_STREAM_MEDIA_TYPE", |
| 188 | + "ResultNotReadyError", |
| 189 | + "ResultsApi", |
| 190 | +] |
0 commit comments