-
Notifications
You must be signed in to change notification settings - Fork 80
Expand file tree
/
Copy patharrow_array_stream.hpp
More file actions
124 lines (101 loc) · 3.54 KB
/
arrow_array_stream.hpp
File metadata and controls
124 lines (101 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb_python/arrow/arrow_array_stream.hpp
//
//
//===----------------------------------------------------------------------===//
#pragma once
#include "duckdb/common/arrow/arrow_wrapper.hpp"
#include "duckdb/common/atomic.hpp"
#include "duckdb/common/constants.hpp"
#include "duckdb/function/table_function.hpp"
#include "duckdb/function/table/arrow.hpp"
#include "duckdb/main/client_config.hpp"
#include "duckdb/main/config.hpp"
#include "duckdb_python/pybind11/pybind_wrapper.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/vector.hpp"
#include "duckdb/main/client_properties.hpp"
namespace duckdb {
namespace pyarrow {
class RecordBatchReader : public py::object {
public:
RecordBatchReader(const py::object &o) : py::object(o, borrowed_t {}) {
}
using py::object::object;
public:
static bool check_(const py::handle &object) {
return !py::none().is(object);
}
};
class Table : public py::object {
public:
Table(const py::object &o) : py::object(o, borrowed_t {}) {
}
using py::object::object;
public:
static bool check_(const py::handle &object) {
return !py::none().is(object);
}
};
} // namespace pyarrow
//! Kind of Python object that is being scanned as Arrow data.
//! This is the result type of GetArrowType() and is cached by PythonTableArrowArrayStreamFactory.
//! The numeric values are written out explicitly; they match the implicit numbering (0..7).
enum class PyArrowObjectType {
	Invalid = 0,
	Table = 1,
	Scanner = 2,
	Dataset = 3,
	PyCapsule = 4,
	PyCapsuleInterface = 5,
	MessageReader = 6,
	PolarsLazyFrame = 7
};
//! Declared here, defined elsewhere.
//! NOTE(review): judging by the name and signature, this presumably turns one Arrow chunk
//! (arrow_schema + data) into a Python object and adds it to `batches` - confirm against the definition.
void TransformDuckToArrowChunk(ArrowSchema &arrow_schema, ArrowArray &data, py::list &batches);
//! Maps a Python object handle to a PyArrowObjectType. Declared here, defined elsewhere.
//! NOTE(review): presumably yields PyArrowObjectType::Invalid for unrecognised objects - confirm.
PyArrowObjectType GetArrowType(const py::handle &obj);
//! Factory that wraps a Python Arrow-like object and exposes the static entry points
//! (Produce / GetSchema) through which a stream and a schema are obtained from it.
//! The static entry points receive the factory as a uintptr_t.
//!
//! NOTE(review): the destructor releases `cached_schema`, but the implicit copy constructor is still
//! available (copy assignment is already unavailable because of the const members). A copy would
//! duplicate the ArrowSchema and its release callback - confirm that no caller copies the factory.
class PythonTableArrowArrayStreamFactory {
public:
	//! \param arrow_table          Python object to scan; stored as a raw pointer, its reference count is
	//!                             not adjusted by the code in this header
	//! \param client_properties_p  Client properties, copied into the factory
	//! \param arrow_type_p         Pre-computed kind of `arrow_table`
	explicit PythonTableArrowArrayStreamFactory(PyObject *arrow_table, const ClientProperties &client_properties_p,
	                                            PyArrowObjectType arrow_type_p)
	    : arrow_object(arrow_table), client_properties(client_properties_p), cached_arrow_type(arrow_type_p) {
		// A null release callback marks the schema as "nothing to release" for the destructor
		cached_schema.release = nullptr;
	}

	~PythonTableArrowArrayStreamFactory() {
		// Dropping a py::object touches Python reference counts, so the GIL is taken first.
		// It is only acquired when there actually is a cached table to drop.
		if (cached_arrow_table.ptr() != nullptr) {
			py::gil_scoped_acquire acquire;
			cached_arrow_table = py::object();
		}
		// Release the cached schema through its own callback, if one was ever installed
		if (cached_schema.release) {
			cached_schema.release(&cached_schema);
		}
	}

	//! Produces an Arrow Scanner, should be only called once when initializing Scan States
	static unique_ptr<ArrowArrayStreamWrapper> Produce(uintptr_t factory, ArrowStreamParameters &parameters);

	//! Get the schema of the arrow object
	static void GetSchemaInternal(py::handle arrow_object, ArrowSchemaWrapper &schema);
	static void GetSchema(uintptr_t factory_ptr, ArrowSchemaWrapper &schema);

	//! Arrow Object (i.e., Scanner, Record Batch Reader, Table, Dataset)
	PyObject *arrow_object;
	//! Copy of the client properties passed at construction
	const ClientProperties client_properties;
	//! Kind of `arrow_object`, passed at construction
	const PyArrowObjectType cached_arrow_type;

	//! Cached Arrow table from an unfiltered .collect().to_arrow() on a LazyFrame.
	//! Avoids re-reading from source and re-converting on repeated scans without filters.
	py::object cached_arrow_table;

private:
	//! Schema cache; `release` is null until a schema has been stored
	ArrowSchema cached_schema;
	bool schema_cached = false;

	static py::object ProduceScanner(py::object &arrow_scanner, py::handle &arrow_obj_handle,
	                                 ArrowStreamParameters &parameters, const ClientProperties &client_properties);
};
} // namespace duckdb
// pybind11 customisation: give the duckdb::pyarrow wrapper types a human-readable Python type name.
// NOTE(review): `handle_type_name` and `_()` come from pybind11; how `name` is consumed (presumably in
// generated signatures / error messages) is not visible in this file - confirm against the pybind11 version in use.
namespace pybind11 {
namespace detail {
//! Name reported for duckdb::pyarrow::RecordBatchReader
template <>
struct handle_type_name<duckdb::pyarrow::RecordBatchReader> {
static constexpr auto name = _("pyarrow.lib.RecordBatchReader");
};
//! Name reported for duckdb::pyarrow::Table
template <>
struct handle_type_name<duckdb::pyarrow::Table> {
static constexpr auto name = _("pyarrow.lib.Table");
};
} // namespace detail
} // namespace pybind11