Skip to content
82 changes: 50 additions & 32 deletions .github/workflows/paimon-python-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
container: "python:${{ matrix.python-version }}-slim"
strategy:
matrix:
python-version: ['3.6.15', '3.10']
python-version: [ '3.6.15', '3.10' ]

steps:
- name: Checkout code
Expand All @@ -70,6 +70,7 @@ jobs:
build-essential \
git \
curl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

- name: Verify Java and Maven installation
Expand All @@ -88,66 +89,82 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
df -h
if [[ "${{ matrix.python-version }}" == "3.6.15" ]]; then
python -m pip install --upgrade pip==21.3.1
python --version
python -m pip install -q pyroaring readerwriterlock==1.0.9 'fsspec==2021.10.1' 'cachetools==4.2.4' 'ossfs==2021.8.0' pyarrow==6.0.1 pandas==1.1.5 'polars==0.9.12' 'fastavro==1.4.7' zstandard==0.19.0 dataclasses==0.8.0 flake8 pytest py4j==0.10.9.9 requests parameterized==0.8.1 2>&1 >/dev/null
python -m pip install --no-cache-dir pyroaring readerwriterlock==1.0.9 'fsspec==2021.10.1' 'cachetools==4.2.4' 'ossfs==2021.8.0' pyarrow==6.0.1 pandas==1.1.5 'polars==0.9.12' 'fastavro==1.4.7' zstandard==0.19.0 dataclasses==0.8.0 flake8 pytest py4j==0.10.9.9 requests parameterized==0.8.1 2>&1 >/dev/null
else
python -m pip install --upgrade pip
python -m pip install -q pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 2>&1 >/dev/null
pip install torch --index-url https://download.pytorch.org/whl/cpu
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
fi
df -h
- name: Run lint-python.sh
shell: bash
run: |
chmod +x paimon-python/dev/lint-python.sh
./paimon-python/dev/lint-python.sh
./paimon-python/dev/lint-python.sh -e pytest_torch

requirement_version_compatible_test:
torch_test:
runs-on: ubuntu-latest
container: "python:3.10-slim"

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up JDK ${{ env.JDK_VERSION }}
uses: actions/setup-java@v4
with:
java-version: ${{ env.JDK_VERSION }}
distribution: 'temurin'

- name: Set up Maven
uses: stCarolas/setup-maven@v4.5
with:
maven-version: 3.8.8

- name: Install system dependencies
shell: bash
run: |
apt-get update && apt-get install -y \
build-essential \
git \
curl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

- name: Verify Java and Maven installation
run: |
java -version
mvn -version

- name: Verify Python version
run: python --version

- name: Build Java
- name: Install Python dependencies
shell: bash
run: |
echo "Start compiling modules"
mvn -T 2C -B clean install -DskipTests
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
- name: Run lint-python.sh
shell: bash
run: |
chmod +x paimon-python/dev/lint-python.sh
./paimon-python/dev/lint-python.sh -i pytest_torch

requirement_version_compatible_test:
runs-on: ubuntu-latest
container: "python:3.10-slim"

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Install system dependencies
shell: bash
run: |
apt-get update && apt-get install -y \
build-essential \
git \
curl \
&& rm -rf /var/lib/apt/lists/*

- name: Verify Python version
run: python --version

- name: Install base Python dependencies
shell: bash
run: |
python -m pip install --upgrade pip
python -m pip install -q \
pip install torch --index-url https://download.pytorch.org/whl/cpu
python -m pip install --no-cache-dir \
pyroaring \
readerwriterlock==1.0.9 \
fsspec==2024.3.1 \
Expand All @@ -165,36 +182,37 @@ jobs:
requests \
parameterized==0.9.0 \
packaging


- name: Test requirement version compatibility
shell: bash
run: |
cd paimon-python

# Test Ray version compatibility
echo "=========================================="
echo "Testing Ray version compatibility"
echo "=========================================="
for ray_version in 2.44.0 2.48.0 2.53.0; do
echo "Testing Ray version: $ray_version"

# Install specific Ray version
python -m pip install -q ray==$ray_version
python -m pip install --no-cache-dir -q ray==$ray_version

# Verify Ray version
python -c "import ray; print(f'Ray version: {ray.__version__}')"
python -c "from packaging.version import parse; import ray; assert parse(ray.__version__) == parse('$ray_version'), f'Expected Ray $ray_version, got {ray.__version__}'"

# Run tests
python -m pytest pypaimon/tests/ray_data_test.py::RayDataTest -v --tb=short || {
echo "Tests failed for Ray $ray_version"
exit 1
}

# Uninstall Ray to avoid conflicts
python -m pip uninstall -y ray
done

# Add other dependency version tests here in the future
# Example:
# echo "=========================================="
Expand Down
67 changes: 50 additions & 17 deletions docs/content/program-api/python-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ catalog_options = {
}
catalog = CatalogFactory.create(catalog_options)
```

{{< /tab >}}
{{< /tabs >}}

Expand Down Expand Up @@ -473,6 +474,38 @@ ray_dataset = table_read.to_ray(splits)

See [Ray Data API Documentation](https://docs.ray.io/en/latest/data/api/doc/ray.data.read_datasource.html) for more details.

### Read Pytorch Dataset

This requires `torch` to be installed.

You can read all the data into a `torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`:

```python
from torch.utils.data import DataLoader

table_read = read_builder.new_read()
dataset = table_read.to_torch(splits, streaming=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we find a reference for this parameter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dataloader = DataLoader(
dataset,
batch_size=2,
num_workers=2, # Concurrency to read data
shuffle=False
)

# Collect all data from dataloader
for batch_idx, batch_data in enumerate(dataloader):
print(batch_data)

# output:
# {'user_id': tensor([1, 2]), 'behavior': ['a', 'b']}
# {'user_id': tensor([3, 4]), 'behavior': ['c', 'd']}
# {'user_id': tensor([5, 6]), 'behavior': ['e', 'f']}
# {'user_id': tensor([7, 8]), 'behavior': ['g', 'h']}
```

When the `streaming` parameter is true, it will iteratively read;
when it is false, it will read the full amount of data into memory.

### Incremental Read

This API allows reading data committed between two snapshot timestamps. The steps are as follows.
Expand Down Expand Up @@ -671,22 +704,22 @@ Key points about shard read:
The following shows the supported features of Python Paimon compared to Java Paimon:

**Catalog Level**
- FileSystemCatalog
- RestCatalog
- FileSystemCatalog
- RestCatalog

**Table Level**
- Append Tables
- `bucket = -1` (unaware)
- `bucket > 0` (fixed)
- Primary Key Tables
- only support deduplicate
- `bucket = -2` (postpone)
- `bucket > 0` (fixed)
- read with deletion vectors enabled
- Read/Write Operations
- Batch read and write for append tables and primary key tables
- Predicate filtering
- Overwrite semantics
- Incremental reading of Delta data
- Reading and writing blob data
- `with_shard` feature
- Append Tables
- `bucket = -1` (unaware)
- `bucket > 0` (fixed)
- Primary Key Tables
- only support deduplicate
- `bucket = -2` (postpone)
- `bucket > 0` (fixed)
- read with deletion vectors enabled
- Read/Write Operations
- Batch read and write for append tables and primary key tables
- Predicate filtering
- Overwrite semantics
- Incremental reading of Delta data
- Reading and writing blob data
- `with_shard` feature
31 changes: 28 additions & 3 deletions paimon-python/dev/lint-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ function collect_checks() {
function get_all_supported_checks() {
_OLD_IFS=$IFS
IFS=$'\n'
SUPPORT_CHECKS=("flake8_check" "pytest_check" "mixed_check") # control the calling sequence
SUPPORT_CHECKS=("flake8_check" "pytest_torch_check" "pytest_check" "mixed_check") # control the calling sequence
for fun in $(declare -F); do
if [[ `regexp_match "$fun" "_check$"` = true ]]; then
check_name="${fun:11}"
Expand Down Expand Up @@ -179,7 +179,7 @@ function pytest_check() {
TEST_DIR="pypaimon/tests/py36"
echo "Running tests for Python 3.6: $TEST_DIR"
else
TEST_DIR="pypaimon/tests --ignore=pypaimon/tests/py36 --ignore=pypaimon/tests/e2e"
TEST_DIR="pypaimon/tests --ignore=pypaimon/tests/py36 --ignore=pypaimon/tests/e2e --ignore=pypaimon/tests/torch_read_test.py"
echo "Running tests for Python $PYTHON_VERSION (excluding py36): pypaimon/tests --ignore=pypaimon/tests/py36"
fi

Expand All @@ -197,7 +197,32 @@ function pytest_check() {
print_function "STAGE" "pytest checks... [SUCCESS]"
fi
}
function pytest_torch_check() {
print_function "STAGE" "pytest torch checks"
if [ ! -f "$PYTEST_PATH" ]; then
echo "For some unknown reasons, the pytest package is not complete."
fi

# Get Python version
PYTHON_VERSION=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
echo "Detected Python version: $PYTHON_VERSION"
TEST_DIR="pypaimon/tests/torch_read_test.py"
echo "Running tests for Python $PYTHON_VERSION: pypaimon/tests/torch_read_test.py"

# the return value of a pipeline is the status of the last command to exit
# with a non-zero status or zero if no command exited with a non-zero status
set -o pipefail
($PYTEST_PATH $TEST_DIR) 2>&1 | tee -a $LOG_FILE

PYCODESTYLE_STATUS=$?
if [ $PYCODESTYLE_STATUS -ne 0 ]; then
print_function "STAGE" "pytest checks... [FAILED]"
# Stop the running script.
exit 1;
else
print_function "STAGE" "pytest checks... [SUCCESS]"
fi
}
# Mixed tests check - runs Java-Python interoperability tests
function mixed_check() {
# Get Python version
Expand Down Expand Up @@ -279,7 +304,7 @@ usage: $0 [options]
-l list all checks supported.
Examples:
./lint-python.sh => exec all checks.
./lint-python.sh -e tox,flake8 => exclude checks tox,flake8.
./lint-python.sh -e flake8 => exclude checks flake8.
./lint-python.sh -i flake8 => include checks flake8.
./lint-python.sh -i mixed => include checks mixed.
./lint-python.sh -l => list all checks supported.
Expand Down
20 changes: 8 additions & 12 deletions paimon-python/dev/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,23 @@
cachetools>=4.2,<6; python_version=="3.6"
cachetools>=5,<6; python_version>"3.6"
dataclasses>=0.8; python_version < "3.7"
fastavro>=1.4,<2; python_version<"3.9"
fastavro>=1.4,<2; python_version>="3.9"
fastavro>=1.4,<2
fsspec>=2021.10,<2026; python_version<"3.8"
fsspec>=2023,<2026; python_version>="3.8"
ossfs>=2021.8; python_version<"3.8"
ossfs>=2023; python_version>="3.8"
packaging>=21,<26; python_version<"3.8"
packaging>=21,<26; python_version>="3.8"
packaging>=21,<26
pandas>=1.1,<2; python_version < "3.7"
pandas>=1.3,<3; python_version >= "3.7" and python_version < "3.9"
pandas>=1.5,<3; python_version >= "3.9"
polars>=0.9,<1; python_version<"3.8"
polars>=1,<2; python_version=="3.8"
polars>=1,<2; python_version>"3.8"
polars>=1,<2; python_version>="3.8"
pyarrow>=6,<7; python_version < "3.8"
pyarrow>=16,<20; python_version >= "3.8" and python_version < "3.13"
pyarrow>=16,<20; python_version >= "3.13"
pyarrow>=16,<20; python_version >= "3.8"
pylance>=0.20,<1; python_version>="3.9"
pylance>=0.10,<1; python_version>="3.8" and python_version<"3.9"
pyroaring
ray>=2.10,<3
readerwriterlock>=1,<2
zstandard>=0.19,<1; python_version<"3.9"
zstandard>=0.19,<1; python_version>="3.9"
pylance>=0.20,<1; python_version>="3.9"
pylance>=0.10,<1; python_version>="3.8" and python_version<"3.9"
torch
zstandard>=0.19,<1
Loading