From 32849011455186e288dc0b8b3b13628e32e099ec Mon Sep 17 00:00:00 2001 From: Rogdham Date: Sat, 6 Dec 2025 11:26:28 +0100 Subject: [PATCH] chore: format and lint with ruff --- .github/workflows/build.yml | 16 + CHANGELOG.md | 1 + docs/conf.py | 16 +- pyproject.toml | 42 +++ requirements-dev.txt | 3 + requirements-lint.txt | 1 + src/pyzstd/__init__.py | 106 +++--- src/pyzstd/__init__.pyi | 133 ++++---- src/pyzstd/__main__.py | 549 ++++++++++++++++++------------- src/pyzstd/_seekable_zstdfile.py | 374 ++++++++++++--------- tests/test_seekable.py | 10 +- tests/test_zstd.py | 2 +- 12 files changed, 726 insertions(+), 527 deletions(-) create mode 100644 requirements-dev.txt create mode 100644 requirements-lint.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5f54906..3fb04d6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -73,6 +73,22 @@ jobs: - name: Test run: python -m unittest discover tests -v + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: 3.14 + - name: Install dependencies + run: python -m pip install -r requirements-lint.txt + - name: ruff check + run: ruff check + - name: ruff format + run: ruff format --check + publish: name: Publish to PyPI if: startsWith(github.ref, 'refs/tags') diff --git a/CHANGELOG.md b/CHANGELOG.md index f3a1bad..6cb56cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Changes in build dependency: remove `setuptools` and C build toolchain, add `hatchling` and `hatch-vcs` - Remove git submodule usage - Drop support for Python 3.9 and below +- Use `ruff` as formatter and linter ## 0.18.0 (October 5, 2025) diff --git a/docs/conf.py b/docs/conf.py index f6a7333..fa704e2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,9 +1,9 @@ -project = 'pyzstd module' -author = 'Ma Lin and contributors' -copyright = '2020-present, Ma Lin and contributors' -language = 'en' +project = "pyzstd module" +author = "Ma Lin and contributors" +copyright = "2020-present, Ma Lin and contributors" +language = "en" -master_doc = 'index' -pygments_style = 'sphinx' -extensions = ['myst_parser', 'sphinx_rtd_theme'] -html_theme = 'sphinx_rtd_theme' +master_doc = "index" +pygments_style = "sphinx" +extensions = ["myst_parser", "sphinx_rtd_theme"] +html_theme = "sphinx_rtd_theme" diff --git a/pyproject.toml b/pyproject.toml index a61a3ed..452c2c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,3 +58,45 @@ version-file = "src/pyzstd/_version.py" [tool.hatch.version] source = "vcs" + + +# +# ruff +# + +[tool.ruff] +src = ["src"] +target-version = "py310" +extend-exclude = [ + "tests", + '*.pyi', # FIXME +] + +[tool.ruff.lint] +select = ["ALL"] +ignore = [ + "ANN", # FIXME + "C901", + "COM812", + "D", + "E501", + "EM", + "ERA001", + "FA100", + "ISC001", + "PLR0912", + "PLR0913", + "PLR0915", + "PLR2004", + "PTH", + "TRY003", + "TRY301", +] + +[tool.ruff.lint.per-file-ignores] +"src/pyzstd/__main__.py" = ["PLC0415", "T201"] +"docs/conf.py" = ["A001", "INP001"] + +[tool.ruff.lint.isort] +force-sort-within-sections = true +known-first-party = ["pyzstd"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..3d574ce --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-e . 
+ +-r requirements-lint.txt diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 0000000..ef96a1b --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1 @@ +ruff==0.14.8 diff --git a/src/pyzstd/__init__.py b/src/pyzstd/__init__.py index d59cfa9..ff98234 100644 --- a/src/pyzstd/__init__.py +++ b/src/pyzstd/__init__.py @@ -1,9 +1,8 @@ +from enum import IntEnum import sys +from typing import NamedTuple import warnings -from collections import namedtuple -from enum import IntEnum - if sys.version_info < (3, 14): from backports import zstd else: @@ -14,8 +13,7 @@ except ImportError: from typing_extensions import deprecated -from pyzstd._version import __version__ - +from pyzstd._version import __version__ # noqa: F401 __doc__ = """\ Python bindings to Zstandard (zstd) compression library, the API style is @@ -28,32 +26,32 @@ PyPI: https://pypi.org/project/pyzstd""" __all__ = ( - "ZstdCompressor", - "RichMemZstdCompressor", - "ZstdDecompressor", - "EndlessZstdDecompressor", "CParameter", "DParameter", + "EndlessZstdDecompressor", + "RichMemZstdCompressor", + "SeekableFormatError", + "SeekableZstdFile", "Strategy", + "ZstdCompressor", + "ZstdDecompressor", + "ZstdDict", "ZstdError", + "ZstdFile", "compress", - "richmem_compress", - "decompress", "compress_stream", + "compressionLevel_values", + "decompress", "decompress_stream", - "ZstdDict", - "train_dict", "finalize_dict", "get_frame_info", "get_frame_size", - "ZstdFile", "open", + "richmem_compress", + "train_dict", + "zstd_support_multithread", "zstd_version", "zstd_version_info", - "zstd_support_multithread", - "compressionLevel_values", - "SeekableZstdFile", - "SeekableFormatError", ) @@ -68,29 +66,29 @@ def __repr__(self): class CParameter(IntEnum): """Compression parameters""" - compressionLevel = zstd.CompressionParameter.compression_level - windowLog = zstd.CompressionParameter.window_log - hashLog = zstd.CompressionParameter.hash_log - chainLog = zstd.CompressionParameter.chain_log - searchLog = zstd.CompressionParameter.search_log - minMatch = zstd.CompressionParameter.min_match - targetLength = zstd.CompressionParameter.target_length + compressionLevel = zstd.CompressionParameter.compression_level # noqa: N815 + windowLog = zstd.CompressionParameter.window_log # noqa: N815 + hashLog = zstd.CompressionParameter.hash_log # noqa: N815 + chainLog = zstd.CompressionParameter.chain_log # noqa: N815 + searchLog = zstd.CompressionParameter.search_log # noqa: N815 + minMatch = zstd.CompressionParameter.min_match # noqa: N815 + targetLength = zstd.CompressionParameter.target_length # noqa: N815 strategy = zstd.CompressionParameter.strategy - targetCBlockSize = 130 # not part of PEP-784 + targetCBlockSize = 130 # not part of PEP-784 # noqa: N815 - enableLongDistanceMatching = zstd.CompressionParameter.enable_long_distance_matching - ldmHashLog = zstd.CompressionParameter.ldm_hash_log - ldmMinMatch = zstd.CompressionParameter.ldm_min_match - ldmBucketSizeLog = zstd.CompressionParameter.ldm_bucket_size_log - ldmHashRateLog = zstd.CompressionParameter.ldm_hash_rate_log + enableLongDistanceMatching = zstd.CompressionParameter.enable_long_distance_matching # noqa: N815 + ldmHashLog = zstd.CompressionParameter.ldm_hash_log # noqa: N815 + ldmMinMatch = zstd.CompressionParameter.ldm_min_match # noqa: N815 + ldmBucketSizeLog = zstd.CompressionParameter.ldm_bucket_size_log # noqa: N815 + ldmHashRateLog = zstd.CompressionParameter.ldm_hash_rate_log # noqa: N815 - contentSizeFlag = zstd.CompressionParameter.content_size_flag 
- checksumFlag = zstd.CompressionParameter.checksum_flag - dictIDFlag = zstd.CompressionParameter.dict_id_flag + contentSizeFlag = zstd.CompressionParameter.content_size_flag # noqa: N815 + checksumFlag = zstd.CompressionParameter.checksum_flag # noqa: N815 + dictIDFlag = zstd.CompressionParameter.dict_id_flag # noqa: N815 - nbWorkers = zstd.CompressionParameter.nb_workers - jobSize = zstd.CompressionParameter.job_size - overlapLog = zstd.CompressionParameter.overlap_log + nbWorkers = zstd.CompressionParameter.nb_workers # noqa: N815 + jobSize = zstd.CompressionParameter.job_size # noqa: N815 + overlapLog = zstd.CompressionParameter.overlap_log # noqa: N815 def bounds(self): """Return lower and upper bounds of a compression parameter, both inclusive.""" @@ -100,7 +98,7 @@ def bounds(self): class DParameter(IntEnum): """Decompression parameters""" - windowLogMax = zstd.DecompressionParameter.window_log_max + windowLogMax = zstd.DecompressionParameter.window_log_max # noqa: N815 def bounds(self): """Return lower and upper bounds of a decompression parameter, both inclusive.""" @@ -110,17 +108,15 @@ def bounds(self): def _convert_level_or_option(level_or_option, mode): """Transform pyzstd params into PEP-784 `options` param""" if not isinstance(mode, str): - raise ValueError(f"Invalid mode type: {mode}") + raise TypeError(f"Invalid mode type: {mode}") read_mode = mode.startswith("r") if isinstance(level_or_option, int): if read_mode: raise TypeError( - ( - "In read mode (decompression), level_or_option argument " - "should be a dict object, that represents decompression " - "option. It doesn't support int type compression level " - "in this case." - ) + "In read mode (decompression), level_or_option argument " + "should be a dict object, that represents decompression " + "option. It doesn't support int type compression level " + "in this case." 
) return { CParameter.compressionLevel: level_or_option, @@ -477,7 +473,7 @@ def __init__( ) -def open( +def open( # noqa: A001 filename, mode="rb", *, @@ -536,7 +532,7 @@ def cb(total_input, total_output, data_in, data_out): elif callback is None: - def cb(total_input, total_output, data_in, data_out): + def cb(total_input, total_output, data_in, data_out): # noqa: ARG001 output_stream.write(data_out) else: @@ -561,7 +557,7 @@ def compress_stream( zstd_dict=None, pledged_input_size=None, read_size=131_072, - write_size=_DEPRECATED_PLACEHOLDER, + write_size=_DEPRECATED_PLACEHOLDER, # noqa: ARG001 callback=None, ): """Compresses input_stream and writes the compressed data to output_stream, it @@ -604,7 +600,7 @@ def compress_stream( total_output = 0 compressor = ZstdCompressor(level_or_option, zstd_dict) if pledged_input_size is not None and pledged_input_size != 2**64 - 1: - compressor._set_pledged_input_size(pledged_input_size) + compressor._set_pledged_input_size(pledged_input_size) # noqa: SLF001 while data_in := input_stream.read(read_size): total_input += len(data_in) data_out = compressor.compress(data_in) @@ -701,9 +697,17 @@ def decompress_stream( zstd_version = zstd.zstd_version zstd_version_info = zstd.zstd_version_info zstd_support_multithread = CParameter.nbWorkers.bounds() != (0, 0) -compressionLevel_values = namedtuple("values", ["default", "min", "max"])( + + +class CompressionValues(NamedTuple): + default: int + min: int + max: int + + +compressionLevel_values = CompressionValues( # noqa: N816 zstd.COMPRESSION_LEVEL_DEFAULT, *CParameter.compressionLevel.bounds() ) # import here to avoid circular dependency issues -from ._seekable_zstdfile import SeekableFormatError, SeekableZstdFile +from ._seekable_zstdfile import SeekableFormatError, SeekableZstdFile # noqa: E402 diff --git a/src/pyzstd/__init__.pyi b/src/pyzstd/__init__.pyi index 9f9c523..d8e4d17 100644 --- a/src/pyzstd/__init__.pyi +++ b/src/pyzstd/__init__.pyi @@ -1,9 +1,16 @@ -import io +from collections.abc import ByteString, Callable, Iterable from enum import IntEnum +import io from os import PathLike -from typing import overload, Dict, ByteString, Optional, Union, Callable, \ - Iterable, Literal, ClassVar, Tuple, NamedTuple, BinaryIO, \ - TextIO +from typing import ( + BinaryIO, + ClassVar, + Literal, + NamedTuple, + TextIO, + TypeAlias, + overload, +) try: from warnings import deprecated @@ -12,15 +19,15 @@ except ImportError: __version__: str zstd_version: str -zstd_version_info: Tuple[int, int, int] +zstd_version_info: tuple[int, int, int] zstd_support_multithread: bool -class values(NamedTuple): +class CompressionValues(NamedTuple): default: int min: int max: int -compressionLevel_values: values +compressionLevel_values: CompressionValues class Strategy(IntEnum): fast: int @@ -58,14 +65,14 @@ class CParameter(IntEnum): jobSize: int overlapLog: int - def bounds(self) -> Tuple[int, int]: ... + def bounds(self) -> tuple[int, int]: ... class DParameter(IntEnum): windowLogMax: int - def bounds(self) -> Tuple[int, int]: ... + def bounds(self) -> tuple[int, int]: ... -ZstdDictInfo = Tuple['ZstdDict', int] +ZstdDictInfo: TypeAlias = tuple[ZstdDict, int] class ZstdDict: dict_content: bytes dict_id: int @@ -88,8 +95,8 @@ class ZstdCompressor: last_mode: Literal[0, 1, 2] def __init__(self, - level_or_option: Union[None, int, Dict[CParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None) -> None: ... 
+ level_or_option: None | int | dict[CParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None) -> None: ... def compress(self, data, @@ -98,13 +105,13 @@ class ZstdCompressor: def flush(self, mode: Literal[1, 2] = ...) -> bytes: ... - def _set_pledged_input_size(self, size: Union[int, None]) -> None: ... + def _set_pledged_input_size(self, size: int | None) -> None: ... @deprecated("See https://pyzstd.readthedocs.io/en/stable/deprecated.html for alternatives to pyzstd.RichMemZstdCompressor") class RichMemZstdCompressor: def __init__(self, - level_or_option: Union[None, int, Dict[CParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None) -> None: ... + level_or_option: None | int | dict[CParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None) -> None: ... def compress(self, data) -> bytes: ... @@ -114,8 +121,8 @@ class ZstdDecompressor: unused_data: bytes def __init__(self, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - option: Optional[Dict[DParameter, int]] = None) -> None: ... + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + option: dict[DParameter, int] | None = None) -> None: ... def decompress(self, data: ByteString, @@ -126,8 +133,8 @@ class EndlessZstdDecompressor: at_frame_edge: bool def __init__(self, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - option: Optional[Dict[DParameter, int]] = None) -> None: ... + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + option: dict[DParameter, int] | None = None) -> None: ... def decompress(self, data: ByteString, @@ -137,32 +144,32 @@ class ZstdError(Exception): ... def compress(data, - level_or_option: Union[None, int, Dict[CParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None) -> bytes: ... + level_or_option: None | int | dict[CParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None) -> bytes: ... @deprecated("See https://pyzstd.readthedocs.io/en/stable/deprecated.html for alternatives to pyzstd.richmem_compress") def richmem_compress(data, - level_or_option: Union[None, int, Dict[CParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None) -> bytes: ... + level_or_option: None | int | dict[CParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None) -> bytes: ... def decompress(data: ByteString, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - option: Optional[Dict[DParameter, int]] = None) -> bytes: ... + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + option: dict[DParameter, int] | None = None) -> bytes: ... @deprecated("See https://pyzstd.readthedocs.io/en/stable/deprecated.html for alternatives to pyzstd.compress_stream") -def compress_stream(input_stream: BinaryIO, output_stream: Union[BinaryIO, None], *, - level_or_option: Union[None, int, Dict[CParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - pledged_input_size: Optional[int] = None, +def compress_stream(input_stream: BinaryIO, output_stream: BinaryIO | None, *, + level_or_option: None | int | dict[CParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + pledged_input_size: int | None = None, read_size: int = 131_072, write_size: int = 131_591, - callback: Optional[Callable[[int, int, memoryview, memoryview], None]] = None) -> Tuple[int, int]: ... + callback: Callable[[int, int, memoryview, memoryview], None] | None = None) -> tuple[int, int]: ... 
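An illustrative aside on the `deprecated` import fallback used near the top of this stub (and in `__init__.py`): on Python 3.13+ the decorator ships in `warnings` (PEP 702), and `typing_extensions` provides a compatible one on older interpreters; in both cases calling a decorated function is expected to emit a DeprecationWarning at runtime. A minimal, self-contained sketch, not part of the patch (`old_api` is a made-up name):

import warnings

try:
    from warnings import deprecated  # Python 3.13+ (PEP 702)
except ImportError:
    from typing_extensions import deprecated

@deprecated("use new_api() instead")
def old_api() -> None:
    """Placeholder function that exists only for this demonstration."""

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_api()

print(caught[0].category)  # <class 'DeprecationWarning'>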
@deprecated("See https://pyzstd.readthedocs.io/en/stable/deprecated.html for alternatives to pyzstd.decompress_stream") -def decompress_stream(input_stream: BinaryIO, output_stream: Union[BinaryIO, None], *, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - option: Optional[Dict[DParameter, int]] = None, +def decompress_stream(input_stream: BinaryIO, output_stream: BinaryIO | None, *, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + option: dict[DParameter, int] | None = None, read_size: int = 131_075, write_size: int = 131_072, - callback: Optional[Callable[[int, int, memoryview, memoryview], None]] = None) -> Tuple[int, int]: ... + callback: Callable[[int, int, memoryview, memoryview], None] | None = None) -> tuple[int, int]: ... def train_dict(samples: Iterable, dict_size: int) -> ZstdDict: ... @@ -173,7 +180,7 @@ def finalize_dict(zstd_dict: ZstdDict, level: int) -> ZstdDict: ... class frame_info(NamedTuple): - decompressed_size: Union[int, None] + decompressed_size: int | None dictionary_id: int def get_frame_info(frame_buffer: ByteString) -> frame_info: ... @@ -185,11 +192,11 @@ class ZstdFile(io.BufferedIOBase): FLUSH_FRAME: ClassVar[Literal[2]] def __init__(self, - filename: Union[str, bytes, PathLike, BinaryIO], + filename: str | bytes | PathLike | BinaryIO, mode: str = "r", *, - level_or_option: Union[None, int, Dict[CParameter, int], Dict[DParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, + level_or_option: None | int | dict[CParameter, int] | dict[DParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, read_size: int = 131_075, write_size: int = 131_591) -> None: ... def close(self) -> None: ... @@ -197,11 +204,11 @@ class ZstdFile(io.BufferedIOBase): def flush(self, mode: Literal[1, 2] = ...) -> None: ... - def read(self, size: Optional[int] = -1) -> bytes: ... + def read(self, size: int | None = -1) -> bytes: ... def read1(self, size: int = -1) -> bytes: ... def readinto(self, b) -> int: ... def readinto1(self, b) -> int: ... - def readline(self, size: Optional[int] = -1) -> bytes: ... + def readline(self, size: int | None = -1) -> bytes: ... def seek(self, offset: int, whence: int = 0) -> int: ... @@ -217,51 +224,51 @@ class ZstdFile(io.BufferedIOBase): class SeekableZstdFile(ZstdFile): def __init__(self, - filename: Union[str, bytes, PathLike, BinaryIO], + filename: str | bytes | PathLike | BinaryIO, mode: str = "r", *, - level_or_option: Union[None, int, Dict[CParameter, int], Dict[DParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, + level_or_option: None | int | dict[CParameter, int] | dict[DParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, read_size: int = 131_075, write_size: int = 131_591, - max_frame_content_size: int = 1024*1024*1024) -> None: ... + max_frame_content_size: int = ...) -> None: ... @property - def seek_table_info(self) -> Tuple[int, int, int]: ... + def seek_table_info(self) -> tuple[int, int, int]: ... @staticmethod - def is_seekable_format_file(filename: Union[str, bytes, PathLike, BinaryIO]) -> bool: ... + def is_seekable_format_file(filename: str | bytes | PathLike | BinaryIO) -> bool: ... 
-_BinaryMode = Literal["r", "rb", # read +_BinaryMode: TypeAlias = Literal["r", "rb", # read "w", "wb", "a", "ab", "x", "xb"] # write -_TextMode = Literal["rt", # read +_TextMode: TypeAlias = Literal["rt", # read "wt", "at", "xt"] # write @overload -def open(filename: Union[str, bytes, PathLike, BinaryIO], +def open(filename: str | bytes | PathLike | BinaryIO, mode: _BinaryMode = "rb", *, - level_or_option: Union[None, int, Dict[CParameter, int], Dict[DParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, + level_or_option: None | int | dict[CParameter, int] | dict[DParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, encoding: None = None, errors: None = None, newline: None = None) -> ZstdFile: ... @overload -def open(filename: Union[str, bytes, PathLike, BinaryIO], +def open(filename: str | bytes | PathLike | BinaryIO, mode: _TextMode, *, - level_or_option: Union[None, int, Dict[CParameter, int], Dict[DParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - encoding: Optional[str] = None, - errors: Optional[str] = None, - newline: Optional[str] = None) -> TextIO: ... + level_or_option: None | int | dict[CParameter, int] | dict[DParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + encoding: str | None = None, + errors: str | None = None, + newline: str | None = None) -> TextIO: ... @overload -def open(filename: Union[str, bytes, PathLike, BinaryIO], +def open(filename: str | bytes | PathLike | BinaryIO, mode: str, *, - level_or_option: Union[None, int, Dict[CParameter, int], Dict[DParameter, int]] = None, - zstd_dict: Union[None, ZstdDict, ZstdDictInfo] = None, - encoding: Optional[str] = None, - errors: Optional[str] = None, - newline: Optional[str] = None) -> Union[ZstdFile, TextIO]: ... + level_or_option: None | int | dict[CParameter, int] | dict[DParameter, int] = None, + zstd_dict: None | ZstdDict | ZstdDictInfo = None, + encoding: str | None = None, + errors: str | None = None, + newline: str | None = None) -> ZstdFile | TextIO: ... diff --git a/src/pyzstd/__main__.py b/src/pyzstd/__main__.py index 2fcbed0..4d7af14 100644 --- a/src/pyzstd/__main__.py +++ b/src/pyzstd/__main__.py @@ -1,30 +1,36 @@ -#!/usr/bin/env python3 # CLI of pyzstd module: python -m pyzstd --help import argparse import os from shutil import copyfileobj from time import time -from pyzstd import \ - CParameter, DParameter, \ - train_dict, ZstdDict, ZstdFile, \ - compressionLevel_values, zstd_version, \ - __version__ as pyzstd_version +from pyzstd import ( + CParameter, + DParameter, + ZstdDict, + ZstdFile, + compressionLevel_values, + train_dict, + zstd_version, +) +from pyzstd import __version__ as pyzstd_version # buffer sizes recommended by zstd C_READ_BUFFER = 131072 D_READ_BUFFER = 131075 + # open output file and assign to args.output def open_output(args, path): if not args.f and os.path.isfile(path): - answer = input(('output file already exists:\n' - '{}\noverwrite? (y/n) ').format(path)) + answer = input(f"output file already exists:\n{path}\noverwrite? 
(y/n) ") print() - if answer != 'y': + if answer != "y": import sys + sys.exit() - args.output = open(path, 'wb') + args.output = open(path, "wb") # noqa: SIM115 + def close_files(args): if args.input is not None: @@ -33,54 +39,58 @@ def close_files(args): if args.output is not None: args.output.close() + def compress_option(args): # threads message if args.threads == 0: - threads_msg = 'single-thread mode' + threads_msg = "single-thread mode" else: - threads_msg = 'multi-thread mode, %d threads.' % args.threads + threads_msg = f"multi-thread mode, {args.threads} threads." # long mode if args.long >= 0: use_long = 1 - windowLog = args.long - long_msg = 'yes, windowLog is %d.' % windowLog + window_log = args.long + long_msg = f"yes, windowLog is {window_log}." else: use_long = 0 - windowLog = 0 - long_msg = 'no' + window_log = 0 + long_msg = "no" # option - option = {CParameter.compressionLevel: args.level, - CParameter.nbWorkers: args.threads, - CParameter.enableLongDistanceMatching: use_long, - CParameter.windowLog: windowLog, - CParameter.checksumFlag: args.checksum, - CParameter.dictIDFlag: args.write_dictID} + option = { + CParameter.compressionLevel: args.level, + CParameter.nbWorkers: args.threads, + CParameter.enableLongDistanceMatching: use_long, + CParameter.windowLog: window_log, + CParameter.checksumFlag: args.checksum, + CParameter.dictIDFlag: args.write_dictID, + } # pre-compress message - msg = (' - compression level: {}\n' - ' - threads: {}\n' - ' - long mode: {}\n' - ' - zstd dictionary: {}\n' - ' - add checksum: {}').format( - args.level, threads_msg, long_msg, - args.zd, args.checksum) + msg = ( + f" - compression level: {args.level}\n" + f" - threads: {threads_msg}\n" + f" - long mode: {long_msg}\n" + f" - zstd dictionary: {args.zd}\n" + f" - add checksum: {args.checksum}" + ) print(msg) return option + def compress(args): # output file if args.output is None: - open_output(args, args.input.name + '.zst') + open_output(args, args.input.name + ".zst") # pre-compress message - msg = ('Compress file:\n' - ' - input file : {}\n' - ' - output file: {}').format( - args.input.name, - args.output.name) + msg = ( + "Compress file:\n" + f" - input file : {args.input.name}\n" + f" - output file: {args.output.name}" + ) print(msg) # option @@ -88,7 +98,7 @@ def compress(args): # compress t1 = time() - with ZstdFile(args.output, 'w', level_or_option=option, zstd_dict=args.zd) as fout: + with ZstdFile(args.output, "w", level_or_option=option, zstd_dict=args.zd) as fout: copyfileobj(args.input, fout) t2 = time() in_size = args.input.tell() @@ -96,24 +106,23 @@ def compress(args): close_files(args) # post-compress message - if in_size != 0: - ratio = 100 * out_size / in_size - else: - ratio = 100.0 - msg = ('\nCompression succeeded, {:.2f} seconds.\n' - 'Input {:,} bytes, output {:,} bytes, ratio {:.2f}%.\n').format( - t2-t1, in_size, out_size, ratio) + ratio = 100.0 if in_size == 0 else 100 * out_size / in_size + msg = ( + f"\nCompression succeeded, {t2 - t1:.2f} seconds.\n" + f"Input {in_size:,} bytes, output {out_size:,} bytes, ratio {ratio:.2f}%.\n" + ) print(msg) + def decompress(args): # output file if args.output is None: if args.test is None: from re import subn - out_path, replaced = subn(r'(?i)^(.*)\.zst$', r'\1', args.input.name) + out_path, replaced = subn(r"(?i)^(.*)\.zst$", r"\1", args.input.name) if not replaced: - out_path = args.input.name + '.decompressed' + out_path = args.input.name + ".decompressed" else: out_path = os.devnull open_output(args, out_path) @@ -124,12 
+133,13 @@ def decompress(args): # pre-decompress message output_name = args.output.name if output_name == os.devnull: - output_name = 'None' - print(('Decompress file:\n' - ' - input file : {}\n' - ' - output file: {}\n' - ' - zstd dictionary: {}').format( - args.input.name, output_name, args.zd)) + output_name = "None" + print( + "Decompress file:\n" + f" - input file : {args.input.name}\n" + f" - output file: {output_name}\n" + f" - zstd dictionary: {args.zd}" + ) # decompress t1 = time() @@ -141,50 +151,54 @@ def decompress(args): close_files(args) # post-decompress message - if out_size != 0: - ratio = 100 * in_size / out_size - else: - ratio = 100.0 - msg = ('\nDecompression succeeded, {:.2f} seconds.\n' - 'Input {:,} bytes, output {:,} bytes, ratio {:.2f}%.\n').format( - t2-t1, in_size, out_size, ratio) + ratio = 100.0 if out_size == 0 else 100 * in_size / out_size + msg = ( + f"\nDecompression succeeded, {t2 - t1:.2f} seconds.\n" + f"Input {in_size:,} bytes, output {out_size:,} bytes, ratio {ratio:.2f}%.\n" + ) print(msg) + def train(args): from glob import glob # check output file if args.output is None: - msg = 'need to specify output file using -o/--output option' - raise Exception(msg) + raise ValueError("need to specify output file using -o/--output option") # gather samples - print('Gathering samples, please wait.', flush=True) + print("Gathering samples, please wait.", flush=True) lst = [] for file in glob(args.train, recursive=True): - with open(file, 'rb') as f: + with open(file, "rb") as f: dat = f.read() lst.append(dat) - print('samples count: %d' % len(lst), end='\r', flush=True) + print("samples count:", len(lst), end="\r", flush=True) if len(lst) == 0: - raise Exception('No samples gathered, please check GLOB_PATH.') + raise ValueError("No samples gathered, please check GLOB_PATH.") samples_size = sum(len(sample) for sample in lst) if samples_size == 0: - raise Exception("Samples content is empty, can't train.") + raise ValueError("Samples content is empty, can't train.") # pre-train message - msg = ('Gathered, train zstd dictionary:\n' - ' - samples: {}\n' - ' - samples number: {}\n' - ' - samples content: {:,} bytes\n' - ' - dict file: {}\n' - ' - dict max size: {:,} bytes\n' - ' - dict id: {}\n' - 'Training, please wait.').format( - args.train, len(lst), samples_size, - args.output.name, args.maxdict, - 'random' if args.dictID is None else args.dictID) + msg = ( + "Gathered, train zstd dictionary:\n" + " - samples: {}\n" + " - samples number: {}\n" + " - samples content: {:,} bytes\n" + " - dict file: {}\n" + " - dict max size: {:,} bytes\n" + " - dict id: {}\n" + "Training, please wait." + ).format( + args.train, + len(lst), + samples_size, + args.output.name, + args.maxdict, + "random" if args.dictID is None else args.dictID, + ) print(msg, flush=True) # train @@ -195,9 +209,11 @@ def train(args): # Dictionary_ID: 4 bytes, stored in little-endian format. # it can be any value, except 0 (which means no Dictionary_ID). 
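The comment above refers to the trained-dictionary header layout: a 4-byte magic 0xEC30A437 (the same little-endian bytes checked later in parse_arg) followed by a 4-byte little-endian Dictionary_ID. A hedged, standalone sketch of reading and rewriting that ID, mirroring what the code just below does (the helper names are made up for the example):

import struct

_DICT_MAGIC = b"\x37\xa4\x30\xec"  # 0xEC30A437, little-endian

def read_dict_id(dict_content: bytes) -> int | None:
    """Return the Dictionary_ID of a trained dictionary, or None for raw content."""
    if len(dict_content) < 8 or dict_content[:4] != _DICT_MAGIC:
        return None
    return struct.unpack("<I", dict_content[4:8])[0]

def replace_dict_id(dict_content: bytes, new_id: int) -> bytes:
    """Rewrite bytes 4..8 with a caller-chosen ID, as the --dictID option does."""
    if not 1 <= new_id <= 0xFFFFFFFF:
        raise ValueError("Dictionary_ID must be in 1..0xFFFFFFFF")
    return dict_content[:4] + new_id.to_bytes(4, "little") + dict_content[8:]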
if args.dictID is not None and len(zd.dict_content) >= 8: - content = zd.dict_content[:4] + \ - args.dictID.to_bytes(4, 'little') + \ - zd.dict_content[8:] + content = ( + zd.dict_content[:4] + + args.dictID.to_bytes(4, "little") + + zd.dict_content[8:] + ) zd = ZstdDict(content) # save to file @@ -205,66 +221,47 @@ def train(args): close_files(args) # post-train message - msg = ('Training succeeded, {:.2f} seconds.\n' - 'Dictionary: {}\n').format(t2-t1, zd) + msg = f"Training succeeded, {t2 - t1:.2f} seconds.\nDictionary: {zd}\n" print(msg) -def get_ZstdTarFile(): - # lazy import for tar operations - from tarfile import TarFile - class ZstdTarFile(TarFile): - def __init__(self, name, mode='r', *, level_or_option=None, zstd_dict=None, **kwargs): - self.zstd_file = ZstdFile(name, mode, - level_or_option=level_or_option, - zstd_dict=zstd_dict) - try: - super().__init__(fileobj=self.zstd_file, mode=mode, **kwargs) - except: - self.zstd_file.close() - raise - - def close(self): - try: - super().close() - finally: - self.zstd_file.close() +def tarfile_create(args): + import sys - return ZstdTarFile + if sys.version_info < (3, 14): + from backports.zstd import tarfile + else: + import tarfile -def tarfile_create(args): # check input dir args.tar_input_dir = args.tar_input_dir.rstrip(os.sep) if not os.path.isdir(args.tar_input_dir): - msg = 'Tar archive input dir invalid: ' + args.tar_input_dir + msg = "Tar archive input dir invalid: " + args.tar_input_dir raise NotADirectoryError(msg) dirname, basename = os.path.split(args.tar_input_dir) # check output file if args.output is None: - out_path = os.path.join(dirname, basename + '.tar.zst') + out_path = os.path.join(dirname, basename + ".tar.zst") open_output(args, out_path) - # get ZstdTarFile class - ZstdTarFile = get_ZstdTarFile() - # pre-compress message - msg = ('Archive tar file:\n' - ' - input directory: {}\n' - ' - output file: {}').format( - args.tar_input_dir, - args.output.name) + msg = ( + "Archive tar file:\n" + f" - input directory: {args.tar_input_dir}\n" + f" - output file: {args.output.name}" + ) print(msg) # option option = compress_option(args) # compress - print('Archiving, please wait.', flush=True) + print("Archiving, please wait.", flush=True) t1 = time() - with ZstdTarFile(args.output, mode='w', - level_or_option=option, - zstd_dict=args.zd) as f: + with tarfile.TarFile.zstopen( + None, fileobj=args.output, mode="w", options=option, zstd_dict=args.zd + ) as f: f.add(args.tar_input_dir, basename) uncompressed_size = f.fileobj.tell() t2 = time() @@ -278,46 +275,51 @@ def tarfile_create(args): else: ratio = 100.0 - msg = ('Archiving succeeded, {:.2f} seconds.\n' - 'Input ~{:,} bytes, output {:,} bytes, ratio {:.2f}%.\n').format( - t2-t1, uncompressed_size, output_file_size, ratio) + msg = ( + f"Archiving succeeded, {t2 - t1:.2f} seconds.\n" + f"Input ~{uncompressed_size:,} bytes, output {output_file_size:,} bytes, ratio {ratio:.2f}%.\n" + ) print(msg) + def tarfile_extract(args): + import sys + + if sys.version_info < (3, 14): + from backports.zstd import tarfile + else: + import tarfile + # input file size if args.input is None: - msg = 'need to specify input file using -d/--decompress option.' + msg = "need to specify input file using -d/--decompress option." 
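For context on the tarfile change above (the custom ZstdTarFile wrapper is removed): the CLI now relies on the zstd-aware tarfile that ships with Python 3.14, falling back to backports.zstd on older interpreters, and opens archives through TarFile.zstopen() with an explicit fileobj. A hedged round-trip sketch using the same call shape as the patch (paths are illustrative; the options/zstd_dict keywords used above are omitted):

import sys

if sys.version_info < (3, 14):
    from backports.zstd import tarfile
else:
    import tarfile

# Create a .tar.zst from a directory, writing through an explicit file object.
with open("example.tar.zst", "wb") as raw:
    with tarfile.TarFile.zstopen(None, fileobj=raw, mode="w") as tar:
        tar.add("some_directory", arcname="some_directory")

# Read it back and list the members.
with open("example.tar.zst", "rb") as raw:
    with tarfile.TarFile.zstopen(None, fileobj=raw, mode="r") as tar:
        print(tar.getnames())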
raise FileNotFoundError(msg) input_file_size = os.path.getsize(args.input.name) # check output dir if not os.path.isdir(args.tar_output_dir): - msg = 'Tar archive output dir invalid: ' + args.tar_output_dir + msg = "Tar archive output dir invalid: " + args.tar_output_dir raise NotADirectoryError(msg) - # get ZstdTarFile class - ZstdTarFile = get_ZstdTarFile() - # option option = {DParameter.windowLogMax: args.windowLogMax} # pre-extract message - msg = ('Extract tar archive:\n' - ' - input file: {}\n' - ' - output dir: {}\n' - ' - zstd dictionary: {}\n' - 'Extracting, please wait.').format( - args.input.name, - args.tar_output_dir, - args.zd) + msg = ( + "Extract tar archive:\n" + f" - input file: {args.input.name}\n" + f" - output dir: {args.tar_output_dir}\n" + f" - zstd dictionary: {args.zd}\n" + "Extracting, please wait." + ) print(msg, flush=True) # extract t1 = time() - with ZstdTarFile(args.input, mode='r', - zstd_dict=args.zd, - level_or_option=option) as f: - f.extractall(args.tar_output_dir) + with tarfile.TarFile.zstopen( + None, fileobj=args.input, mode="r", zstd_dict=args.zd, options=option + ) as f: + f.extractall(args.tar_output_dir, filter="data") uncompressed_size = f.fileobj.tell() t2 = time() close_files(args) @@ -327,123 +329,192 @@ def tarfile_extract(args): ratio = 100 * input_file_size / uncompressed_size else: ratio = 100.0 - msg = ('Extraction succeeded, {:.2f} seconds.\n' - 'Input {:,} bytes, output ~{:,} bytes, ratio {:.2f}%.\n').format( - t2-t1, input_file_size, uncompressed_size, ratio) + msg = ( + f"Extraction succeeded, {t2 - t1:.2f} seconds.\n" + f"Input {input_file_size:,} bytes, output ~{uncompressed_size:,} bytes, ratio {ratio:.2f}%.\n" + ) print(msg) -def range_action(min, max, bits_msg=False): + +def range_action(start, end): class RangeAction(argparse.Action): - def __call__(self, parser, args, values, option_string=None): + def __call__(self, parser, args, values, option_string=None): # noqa: ARG002 # convert to int try: v = int(values) - except: - raise TypeError('{} should be an integer'.format(option_string)) + except ValueError: + raise TypeError(f"{option_string} should be an integer") from None # check range - if not (min <= v <= max): + if not (start <= v <= end): # message - msg = ('{} value should: {} <= v <= {}. ' - 'provided value is {}.').format( - option_string, min, max, v) + msg = ( + f"{option_string} value should: {start} <= v <= {end}. " + f"provided value is {v}." 
+ ) raise ValueError(msg) setattr(args, self.dest, v) + return RangeAction + def parse_arg(): p = argparse.ArgumentParser( - prog = 'CLI of pyzstd module', - description = ("The command style is similar to zstd's " - "CLI, but there are some differences.\n" - "Zstd's CLI should be faster, it has " - "some I/O optimizations."), - epilog=('Examples of use:\n' - ' compress a file:\n' - ' python -m pyzstd -c IN_FILE -o OUT_FILE\n' - ' decompress a file:\n' - ' python -m pyzstd -d IN_FILE -o OUT_FILE\n' - ' create a tar archive:\n' - ' python -m pyzstd --tar-input-dir DIR -o OUT_FILE\n' - ' extract a tar archive, output will forcibly overwrite existing files:\n' - ' python -m pyzstd -d IN_FILE --tar-output-dir DIR\n' - ' train a zstd dictionary, ** traverses sub-directories:\n' - ' python -m pyzstd --train "E:\\cpython\\**\\*.c" -o OUT_FILE'), - formatter_class=argparse.RawDescriptionHelpFormatter) - - g = p.add_argument_group('Common arguments') - g.add_argument('-D', '--dict', metavar='FILE', type=argparse.FileType('rb'), - help='use FILE as zstd dictionary for compression or decompression') - g.add_argument('-o', '--output', metavar='FILE', type=str, - help='result stored into FILE') - g.add_argument('-f', action='store_true', - help='disable output check, allows overwriting existing file.') - - g = p.add_argument_group('Compression arguments') + prog="CLI of pyzstd module", + description=( + "The command style is similar to zstd's " + "CLI, but there are some differences.\n" + "Zstd's CLI should be faster, it has " + "some I/O optimizations." + ), + epilog=( + "Examples of use:\n" + " compress a file:\n" + " python -m pyzstd -c IN_FILE -o OUT_FILE\n" + " decompress a file:\n" + " python -m pyzstd -d IN_FILE -o OUT_FILE\n" + " create a tar archive:\n" + " python -m pyzstd --tar-input-dir DIR -o OUT_FILE\n" + " extract a tar archive, output will forcibly overwrite existing files:\n" + " python -m pyzstd -d IN_FILE --tar-output-dir DIR\n" + " train a zstd dictionary, ** traverses sub-directories:\n" + ' python -m pyzstd --train "E:\\cpython\\**\\*.c" -o OUT_FILE' + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + g = p.add_argument_group("Common arguments") + g.add_argument( + "-D", + "--dict", + metavar="FILE", + type=argparse.FileType("rb"), + help="use FILE as zstd dictionary for compression or decompression", + ) + g.add_argument( + "-o", "--output", metavar="FILE", type=str, help="result stored into FILE" + ) + g.add_argument( + "-f", + action="store_true", + help="disable output check, allows overwriting existing file.", + ) + + g = p.add_argument_group("Compression arguments") gm = g.add_mutually_exclusive_group() - gm.add_argument('-c', '--compress', metavar='FILE', type=str, - help='compress FILE') - gm.add_argument('--tar-input-dir', metavar='DIR', type=str, - help=('create a tar archive from DIR. ' - 'this option overrides -c/--compress option.')) - g.add_argument('-l', '--level', metavar='#', - default=compressionLevel_values.default, - action=range_action(compressionLevel_values.min, - compressionLevel_values.max), - help='compression level, range: [{},{}], default: {}.'. - format(compressionLevel_values.min, - compressionLevel_values.max, - compressionLevel_values.default)) - g.add_argument('-t', '--threads', metavar='#', default=0, - action=range_action(*CParameter.nbWorkers.bounds(), True), - help=('spawns # threads to compress. 
if this option is not ' - 'specified or is 0, use single thread mode.')) - g.add_argument('--long', metavar='#', nargs='?', const=27, default=-1, - action=range_action(*CParameter.windowLog.bounds(), True), - help='enable long distance matching with given windowLog (default #: 27)') - g.add_argument('--no-checksum', action='store_false', - dest='checksum', default=True, - help="don't add 4-byte XXH64 checksum to the frame") - g.add_argument('--no-dictID', action='store_false', - dest='write_dictID', default=True, - help="don't write dictID into frame header (dictionary compression only)") - - g = p.add_argument_group('Decompression arguments') + gm.add_argument("-c", "--compress", metavar="FILE", type=str, help="compress FILE") + gm.add_argument( + "--tar-input-dir", + metavar="DIR", + type=str, + help=( + "create a tar archive from DIR. this option overrides -c/--compress option." + ), + ) + g.add_argument( + "-l", + "--level", + metavar="#", + default=compressionLevel_values.default, + action=range_action(compressionLevel_values.min, compressionLevel_values.max), + help=f"compression level, range: [{compressionLevel_values.min},{compressionLevel_values.max}], default: {compressionLevel_values.default}.", + ) + g.add_argument( + "-t", + "--threads", + metavar="#", + default=0, + action=range_action(*CParameter.nbWorkers.bounds()), + help=( + "spawns # threads to compress. if this option is not " + "specified or is 0, use single thread mode." + ), + ) + g.add_argument( + "--long", + metavar="#", + nargs="?", + const=27, + default=-1, + action=range_action(*CParameter.windowLog.bounds()), + help="enable long distance matching with given windowLog (default #: 27)", + ) + g.add_argument( + "--no-checksum", + action="store_false", + dest="checksum", + default=True, + help="don't add 4-byte XXH64 checksum to the frame", + ) + g.add_argument( + "--no-dictID", + action="store_false", + dest="write_dictID", + default=True, + help="don't write dictID into frame header (dictionary compression only)", + ) + + g = p.add_argument_group("Decompression arguments") gm = g.add_mutually_exclusive_group() - gm.add_argument('-d', '--decompress', metavar='FILE', type=str, - help='decompress FILE') - g.add_argument('--tar-output-dir', metavar='DIR', type=str, - help=('extract tar archive to DIR, ' - 'output will forcibly overwrite existing files. ' - 'this option overrides -o/--output option.')) - gm.add_argument('--test', metavar='FILE', type=str, - help='try to decompress FILE to check integrity') - g.add_argument('--windowLogMax', metavar='#', default=0, - action=range_action(*DParameter.windowLogMax.bounds(), True), - help='set a memory usage limit for decompression (windowLogMax)') - - g = p.add_argument_group('Dictionary builder') - g.add_argument('--train', metavar='GLOB_PATH', type=str, - help='create a dictionary from a training set of files') - g.add_argument('--maxdict', metavar='SIZE', type=int, default=112640, - help='limit dictionary to SIZE bytes (default: 112640)') - g.add_argument('--dictID', metavar='DICT_ID', default=None, - action=range_action(1, 0xFFFFFFFF), - help='specify dictionary ID value (default: random)') + gm.add_argument( + "-d", "--decompress", metavar="FILE", type=str, help="decompress FILE" + ) + g.add_argument( + "--tar-output-dir", + metavar="DIR", + type=str, + help=( + "extract tar archive to DIR, " + "output will forcibly overwrite existing files. " + "this option overrides -o/--output option." 
+ ), + ) + gm.add_argument( + "--test", + metavar="FILE", + type=str, + help="try to decompress FILE to check integrity", + ) + g.add_argument( + "--windowLogMax", + metavar="#", + default=0, + action=range_action(*DParameter.windowLogMax.bounds()), + help="set a memory usage limit for decompression (windowLogMax)", + ) + + g = p.add_argument_group("Dictionary builder") + g.add_argument( + "--train", + metavar="GLOB_PATH", + type=str, + help="create a dictionary from a training set of files", + ) + g.add_argument( + "--maxdict", + metavar="SIZE", + type=int, + default=112640, + help="limit dictionary to SIZE bytes (default: 112640)", + ) + g.add_argument( + "--dictID", + metavar="DICT_ID", + default=None, + action=range_action(1, 0xFFFFFFFF), + help="specify dictionary ID value (default: random)", + ) args = p.parse_args() # input file if args.compress is not None: - args.input = open(args.compress, 'rb', - buffering=C_READ_BUFFER) + args.input = open(args.compress, "rb", buffering=C_READ_BUFFER) # noqa: SIM115 elif args.decompress is not None: - args.input = open(args.decompress, 'rb', - buffering=D_READ_BUFFER) + args.input = open(args.decompress, "rb", buffering=D_READ_BUFFER) # noqa: SIM115 elif args.test is not None: - args.input = open(args.test, 'rb', - buffering=D_READ_BUFFER) + args.input = open(args.test, "rb", buffering=D_READ_BUFFER) # noqa: SIM115 else: args.input = None @@ -456,22 +527,27 @@ def parse_arg(): zd_content = args.dict.read() args.dict.close() # Magic_Number: 4 bytes, value 0xEC30A437, little-endian format. - is_raw = zd_content[:4] != b'\x37\xA4\x30\xEC' + is_raw = zd_content[:4] != b"\x37\xa4\x30\xec" args.zd = ZstdDict(zd_content, is_raw=is_raw) else: args.zd = None # arguments combination - functions = [args.compress, args.decompress, - args.test, args.train, args.tar_input_dir] + functions = [ + args.compress, + args.decompress, + args.test, + args.train, + args.tar_input_dir, + ] if sum(1 for i in functions if i is not None) > 1: - raise Exception('Wrong arguments combination') + raise ValueError("Wrong arguments combination") return args + def main(): - print('*** pyzstd module v{}, zstd library v{}. ***\n'. - format(pyzstd_version, zstd_version)) + print(f"*** pyzstd module v{pyzstd_version}, zstd library v{zstd_version}. ***\n") args = parse_arg() @@ -486,7 +562,8 @@ def main(): elif args.train: train(args) else: - print('Invalid command. See help: python -m pyzstd --help') + print("Invalid command. See help: python -m pyzstd --help") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/pyzstd/_seekable_zstdfile.py b/src/pyzstd/_seekable_zstdfile.py index 3ef7e8d..c4f6653 100644 --- a/src/pyzstd/_seekable_zstdfile.py +++ b/src/pyzstd/_seekable_zstdfile.py @@ -6,20 +6,23 @@ from struct import Struct import warnings -from pyzstd import ZstdCompressor, ZstdDecompressor, _DEPRECATED_PLACEHOLDER +from pyzstd import _DEPRECATED_PLACEHOLDER, ZstdCompressor, ZstdDecompressor -__all__ = ('SeekableFormatError', 'SeekableZstdFile') +__all__ = ("SeekableFormatError", "SeekableZstdFile") _MODE_CLOSED = 0 _MODE_READ = 1 _MODE_WRITE = 2 + class SeekableFormatError(Exception): - 'An error related to Zstandard Seekable Format.' + "An error related to Zstandard Seekable Format." 
+ def __init__(self, msg): - super().__init__('Zstandard Seekable Format error: ' + msg) + super().__init__("Zstandard Seekable Format error: " + msg) + -__doc__ = '''\ +__doc__ = """\ Zstandard Seekable Format (Ver 0.1.0, Apr 2017) Square brackets are used to indicate optional fields. All numeric fields are little-endian unless specified otherwise. @@ -39,13 +42,14 @@ def __init__(self, msg): Bit_number Field_name 7 Checksum_Flag 6-2 Reserved_Bits (should ensure they are set to 0) - 1-0 Unused_Bits (should not interpret these bits)''' -__format_version__ = '0.1.0' + 1-0 Unused_Bits (should not interpret these bits)""" +__format_version__ = "0.1.0" + class _SeekTable: - _s_2uint32 = Struct(' fsize - skippable_frame_size: - msg = ('Wrong seek table. Since index %d frame (0-based), ' - 'the cumulated compressed size is greater than ' - 'file size.') % idx + msg = ( + f"Wrong seek table. Since index {idx} frame (0-based), " + "the cumulated compressed size is greater than " + "file size." + ) raise SeekableFormatError(msg) # Check format if self._full_c_size != fsize - skippable_frame_size: - raise SeekableFormatError('The cumulated compressed size is wrong') + raise SeekableFormatError("The cumulated compressed size is wrong") # Parsed successfully, save for future use. self._seek_frame_size = skippable_frame_size @@ -198,19 +213,16 @@ def load_seek_table(self, fp, seek_to_0): # Find frame index by decompressed position def index_by_dpos(self, pos): # Array's first item is 0, so need this. - if pos < 0: - pos = 0 + pos = max(pos, 0) i = bisect_right(self._cumulated_d_size, pos) if i != self._frames_count + 1: return i - else: - # None means >= EOF - return None + # None means >= EOF + return None def get_frame_sizes(self, i): - return (self._cumulated_c_size[i-1], - self._cumulated_d_size[i-1]) + return (self._cumulated_c_size[i - 1], self._cumulated_d_size[i - 1]) def get_full_c_size(self): return self._full_c_size @@ -239,9 +251,9 @@ def _merge_frames(self, max_frames): # Merge c_size = 0 d_size = 0 - for j in range(pos, pos+length, 2): + for j in range(pos, pos + length, 2): c_size += arr[j] - d_size += arr[j+1] + d_size += arr[j + 1] self.append_entry(c_size, d_size) pos += length @@ -251,12 +263,14 @@ def write_seek_table(self, fp): if self._frames_count > 0xFFFFFFFF: # Emit a warning warnings.warn( - ('SeekableZstdFile\'s seek table has %d entries, ' - 'which exceeds the maximal value allowed by ' - 'Zstandard Seekable Format (0xFFFFFFFF). The ' - 'entries will be merged into 0xFFFFFFFF entries, ' - 'this may reduce seeking performance.') % self._frames_count, - RuntimeWarning, 3) + f"SeekableZstdFile's seek table has {self._frames_count} entries, " + "which exceeds the maximal value allowed by " + "Zstandard Seekable Format (0xFFFFFFFF). 
The " + "entries will be merged into 0xFFFFFFFF entries, " + "this may reduce seeking performance.", + RuntimeWarning, + 3, + ) # Merge frames self._merge_frames(0xFFFFFFFF) @@ -267,17 +281,14 @@ def write_seek_table(self, fp): ba = bytearray(size) # Header - self._s_2uint32.pack_into(ba, offset, 0x184D2A5E, size-8) + self._s_2uint32.pack_into(ba, offset, 0x184D2A5E, size - 8) offset += 8 # Entries for i in range(0, len(self._frames), 2): - self._s_2uint32.pack_into(ba, offset, - self._frames[i], - self._frames[i+1]) + self._s_2uint32.pack_into(ba, offset, self._frames[i], self._frames[i + 1]) offset += 8 # Footer - self._s_footer.pack_into(ba, offset, - self._frames_count, 0, 0x8F92EAB1) + self._s_footer.pack_into(ba, offset, self._frames_count, 0, 0x8F92EAB1) # Write fp.write(ba) @@ -294,31 +305,32 @@ def __len__(self): return self._frames_count def get_info(self): - return (self._frames_count, - self._full_c_size, - self._full_d_size) + return (self._frames_count, self._full_c_size, self._full_d_size) -class _EOFSuccess(EOFError): +class _EOFSuccess(EOFError): # noqa: N818 pass class _SeekableDecompressReader(io.RawIOBase): def __init__(self, fp, zstd_dict, option, read_size): # Check fp readable/seekable - if not hasattr(fp, 'readable') or not hasattr(fp, "seekable"): + if not hasattr(fp, "readable") or not hasattr(fp, "seekable"): raise TypeError( - ("In SeekableZstdFile's reading mode, the file object should " - "have .readable()/.seekable() methods.")) + "In SeekableZstdFile's reading mode, the file object should " + "have .readable()/.seekable() methods." + ) if not fp.readable(): raise TypeError( - ("In SeekableZstdFile's reading mode, the file object should " - "be readable.")) + "In SeekableZstdFile's reading mode, the file object should " + "be readable." + ) if not fp.seekable(): raise TypeError( - ("In SeekableZstdFile's reading mode, the file object should " - "be seekable. If the file object is not seekable, it can be " - "read sequentially using ZstdFile class.")) + "In SeekableZstdFile's reading mode, the file object should " + "be seekable. If the file object is not seekable, it can be " + "read sequentially using ZstdFile class." + ) self._fp = fp self._zstd_dict = zstd_dict @@ -360,7 +372,9 @@ def _decompress(self, size): elif self._decompressor.needs_input: data = self._fp.read(self._read_size) if not data: # EOF - raise EOFError("Compressed file ended before the end-of-stream marker was reached") + raise EOFError( + "Compressed file ended before the end-of-stream marker was reached" + ) else: data = self._decompressor.unused_data if self._decompressor.eof: # frame edge @@ -374,11 +388,11 @@ def _decompress(self, size): return out def readinto(self, b): - with memoryview(b) as view, view.cast('B') as byte_view: + with memoryview(b) as view, view.cast("B") as byte_view: try: while True: if out := self._decompress(byte_view.nbytes): - byte_view[:len(out)] = out + byte_view[: len(out)] = out return len(out) except _EOFSuccess: return 0 @@ -387,14 +401,14 @@ def readinto(self, b): # this method may not be called. 
def seek(self, offset, whence=0): # offset is absolute file position - if whence == 0: # SEEK_SET + if whence == 0: # SEEK_SET pass elif whence == 1: # SEEK_CUR offset = self._pos + offset elif whence == 2: # SEEK_END offset = self._size + offset else: - raise ValueError("Invalid value for whence: {}".format(whence)) + raise ValueError(f"Invalid value for whence: {whence}") # Get new frame index new_frame = self._seek_table.index_by_dpos(offset) @@ -414,9 +428,7 @@ def seek(self, offset, whence=0): # |--data1--|--skippable--|--data2--| # cpos: ^P1 # dpos: ^P1 ^P2 - if new_frame == old_frame and \ - offset >= self._pos and \ - self._fp.tell() >= c_pos: + if new_frame == old_frame and offset >= self._pos and self._fp.tell() >= c_pos: pass else: # Jump @@ -434,6 +446,7 @@ def seek(self, offset, whence=0): def get_seek_table_info(self): return self._seek_table.get_info() + # Compared to ZstdFile class, it's important to handle the seekable # of underlying file object carefully. Need to check seekable in # each situation. For example, there may be a CD-R file system that @@ -443,19 +456,27 @@ class SeekableZstdFile(io.BufferedIOBase): or read 0-size file. It provides relatively fast seeking ability in read mode. """ + # The format uses uint32_t for compressed/decompressed sizes. If flush # block a lot, compressed_size may exceed the limit, so set a max size. - FRAME_MAX_C_SIZE = 2*1024*1024*1024 + FRAME_MAX_C_SIZE = 2 * 1024 * 1024 * 1024 # Zstd seekable format's example code also use 1GiB as max content size. - FRAME_MAX_D_SIZE = 1*1024*1024*1024 + FRAME_MAX_D_SIZE = 1 * 1024 * 1024 * 1024 FLUSH_BLOCK = ZstdCompressor.FLUSH_BLOCK FLUSH_FRAME = ZstdCompressor.FLUSH_FRAME - def __init__(self, filename, mode="r", *, - level_or_option=None, zstd_dict=None, - read_size=_DEPRECATED_PLACEHOLDER, write_size=_DEPRECATED_PLACEHOLDER, - max_frame_content_size=1024*1024*1024): + def __init__( + self, + filename, + mode="r", + *, + level_or_option=None, + zstd_dict=None, + read_size=_DEPRECATED_PLACEHOLDER, + write_size=_DEPRECATED_PLACEHOLDER, + max_frame_content_size=1024 * 1024 * 1024, + ): """Open a Zstandard Seekable Format file in binary mode. In read mode, the file can be 0-size file. @@ -488,11 +509,19 @@ def __init__(self, filename, mode="r", *, if read_size == _DEPRECATED_PLACEHOLDER: read_size = 131075 else: - warnings.warn("pyzstd.SeekableZstdFile()'s read_size parameter is deprecated", DeprecationWarning, stacklevel=2) + warnings.warn( + "pyzstd.SeekableZstdFile()'s read_size parameter is deprecated", + DeprecationWarning, + stacklevel=2, + ) if write_size == _DEPRECATED_PLACEHOLDER: write_size = 131591 else: - warnings.warn("pyzstd.SeekableZstdFile()'s write_size parameter is deprecated", DeprecationWarning, stacklevel=2) + warnings.warn( + "pyzstd.SeekableZstdFile()'s write_size parameter is deprecated", + DeprecationWarning, + stacklevel=2, + ) self._fp = None self._close_fp = False @@ -500,42 +529,45 @@ def __init__(self, filename, mode="r", *, self._buffer = None if not isinstance(mode, str): - raise ValueError('mode must be a str') - mode = mode.removesuffix('b') # handle rb, wb, xb, ab + raise TypeError("mode must be a str") + mode = mode.removesuffix("b") # handle rb, wb, xb, ab # Read or write mode if mode == "r": if not isinstance(level_or_option, (type(None), dict)): raise TypeError( - ("In read mode (decompression), level_or_option argument " - "should be a dict object, that represents decompression " - "option. 
It doesn't support int type compression level " - "in this case.")) + "In read mode (decompression), level_or_option argument " + "should be a dict object, that represents decompression " + "option. It doesn't support int type compression level " + "in this case." + ) if read_size <= 0: raise ValueError("read_size argument should > 0") if write_size != 131591: - raise ValueError( - "write_size argument is only valid in write modes.") + raise ValueError("write_size argument is only valid in write modes.") # Specified max_frame_content_size argument - if max_frame_content_size != 1024*1024*1024: - raise ValueError(('max_frame_content_size argument is only ' - 'valid in write modes (compression).')) + if max_frame_content_size != 1024 * 1024 * 1024: + raise ValueError( + "max_frame_content_size argument is only " + "valid in write modes (compression)." + ) mode_code = _MODE_READ elif mode in {"w", "a", "x"}: if not isinstance(level_or_option, (type(None), int, dict)): - raise TypeError(("level_or_option argument " - "should be int or dict object.")) + raise TypeError( + "level_or_option argument should be int or dict object." + ) if read_size != 131075: - raise ValueError( - "read_size argument is only valid in read mode.") + raise ValueError("read_size argument is only valid in read mode.") if write_size <= 0: raise ValueError("write_size argument should > 0") if not (0 < max_frame_content_size <= self.FRAME_MAX_D_SIZE): raise ValueError( - ('max_frame_content_size argument should be ' - '0 < value <= %d, provided value is %d.') % \ - (self.FRAME_MAX_D_SIZE, max_frame_content_size)) + "max_frame_content_size argument should be " + f"0 < value <= {self.FRAME_MAX_D_SIZE}, " + f"provided value is {max_frame_content_size}." + ) # For seekable format self._max_frame_content_size = max_frame_content_size @@ -543,41 +575,43 @@ def __init__(self, filename, mode="r", *, self._seek_table = _SeekTable(read_mode=False) mode_code = _MODE_WRITE - self._compressor = ZstdCompressor(level_or_option=level_or_option, - zstd_dict=zstd_dict) + self._compressor = ZstdCompressor( + level_or_option=level_or_option, zstd_dict=zstd_dict + ) self._pos = 0 # Load seek table in append mode if mode == "a": if not isinstance(filename, (str, bytes, PathLike)): raise TypeError( - ("In append mode ('a', 'ab'), " - "SeekableZstdFile.__init__() method can't " - "accept file object as filename argument. " - "Please use file path (str/bytes/PathLike).")) + "In append mode ('a', 'ab'), " + "SeekableZstdFile.__init__() method can't " + "accept file object as filename argument. " + "Please use file path (str/bytes/PathLike)." + ) # Load seek table if file exists if isfile(filename): - with io.open(filename, "rb") as f: + with open(filename, "rb") as f: if not hasattr(f, "seekable") or not f.seekable(): raise TypeError( - ("In SeekableZstdFile's append mode " - "('a', 'ab'), the opened 'rb' file " - "object should be seekable.")) + "In SeekableZstdFile's append mode " + "('a', 'ab'), the opened 'rb' file " + "object should be seekable." 
+ ) self._seek_table.load_seek_table(f, seek_to_0=False) else: - raise ValueError("Invalid mode: {!r}".format(mode)) + raise ValueError(f"Invalid mode: {mode!r}") # File object if isinstance(filename, (str, bytes, PathLike)): - self._fp = io.open(filename, mode + "b") + self._fp = open(filename, mode + "b") # noqa: SIM115 self._close_fp = True elif hasattr(filename, "read") or hasattr(filename, "write"): self._fp = filename else: - raise TypeError(("filename must be a str, bytes, " - "file or PathLike object")) + raise TypeError("filename must be a str, bytes, file or PathLike object") self._mode = mode_code @@ -586,7 +620,8 @@ def __init__(self, filename, mode="r", *, self._fp, zstd_dict=zstd_dict, option=level_or_option, - read_size=read_size) + read_size=read_size, + ) self._buffer = io.BufferedReader(raw) elif mode == "a": @@ -596,18 +631,22 @@ def __init__(self, filename, mode="r", *, self._fp.truncate() else: # Add the seek table frame - self._seek_table.append_entry( - self._seek_table.seek_frame_size, 0) + self._seek_table.append_entry(self._seek_table.seek_frame_size, 0) # Emit a warning - warnings.warn(("SeekableZstdFile is opened in append mode " - "('a', 'ab'), but the underlying file object " - "is not seekable. Therefore the seek table (a " - "zstd skippable frame) at the end of the file " - "can't be overwritten. Each time open such file " - "in append mode, it will waste some storage " - "space. %d bytes were wasted this time.") % \ - self._seek_table.seek_frame_size, - RuntimeWarning, 2) + warnings.warn( + ( + "SeekableZstdFile is opened in append mode " + "('a', 'ab'), but the underlying file object " + "is not seekable. Therefore the seek table (a " + "zstd skippable frame) at the end of the file " + "can't be overwritten. Each time open such file " + "in append mode, it will waste some storage " + f"space. {self._seek_table.seek_frame_size} bytes " + "were wasted this time." + ), + RuntimeWarning, + 2, + ) def _reset_frame_sizes(self): self._current_c_size = 0 @@ -639,7 +678,7 @@ def close(self): return try: if self._mode == _MODE_READ: - if getattr(self, '_buffer', None): + if getattr(self, "_buffer", None): self._buffer.close() self._buffer = None elif self._mode == _MODE_WRITE: @@ -667,7 +706,7 @@ def write(self, data): self._check_can_write() # Accept any data that supports the buffer protocol. # And memoryview's subview is faster than slice. 
- with memoryview(data) as view, view.cast('B') as byte_view: + with memoryview(data) as view, view.cast("B") as byte_view: nbytes = byte_view.nbytes pos = 0 @@ -676,7 +715,9 @@ def write(self, data): write_size = min(nbytes, self._left_d_size) # Compress & write - compressed = self._compressor.compress(byte_view[pos:pos+write_size]) + compressed = self._compressor.compress( + byte_view[pos : pos + write_size] + ) output_size = self._fp.write(compressed) self._pos += write_size @@ -689,8 +730,10 @@ def write(self, data): self._left_d_size -= write_size # Should flush a frame - if self._left_d_size == 0 or \ - self._current_c_size >= self.FRAME_MAX_C_SIZE: + if ( + self._left_d_size == 0 + or self._current_c_size >= self.FRAME_MAX_C_SIZE + ): self.flush(self.FLUSH_FRAME) return pos @@ -712,15 +755,17 @@ def flush(self, mode=ZstdCompressor.FLUSH_BLOCK): self._check_not_closed() if mode not in {self.FLUSH_BLOCK, self.FLUSH_FRAME}: - raise ValueError('Invalid mode argument, expected either ' - 'ZstdFile.FLUSH_FRAME or ' - 'ZstdFile.FLUSH_BLOCK') + raise ValueError( + "Invalid mode argument, expected either " + "ZstdFile.FLUSH_FRAME or " + "ZstdFile.FLUSH_BLOCK" + ) if self._compressor.last_mode != mode: # Flush zstd block/frame, and write. compressed = self._compressor.flush(mode) output_size = self._fp.write(compressed) - if hasattr(self._fp, 'flush'): + if hasattr(self._fp, "flush"): self._fp.flush() # Cumulate @@ -728,11 +773,9 @@ def flush(self, mode=ZstdCompressor.FLUSH_BLOCK): # self._current_d_size += 0 # self._left_d_size -= 0 - if mode == self.FLUSH_FRAME and \ - self._current_c_size != 0: + if mode == self.FLUSH_FRAME and self._current_c_size != 0: # Add an entry to seek table - self._seek_table.append_entry(self._current_c_size, - self._current_d_size) + self._seek_table.append_entry(self._current_c_size, self._current_d_size) self._reset_frame_sizes() def read(self, size=-1): @@ -827,8 +870,9 @@ def tell(self): self._check_not_closed() if self._mode == _MODE_READ: return self._buffer.tell() - elif self._mode == _MODE_WRITE: + if self._mode == _MODE_WRITE: return self._pos + raise RuntimeError # impossible code path def fileno(self): """Return the file descriptor for the underlying file.""" @@ -871,11 +915,10 @@ def seek_table_info(self): """ if self._mode == _MODE_WRITE: return self._seek_table.get_info() - elif self._mode == _MODE_READ: + if self._mode == _MODE_READ: return self._buffer.raw.get_seek_table_info() - else: - # Closed - return None + # Closed + return None @staticmethod def is_seekable_format_file(filename): @@ -889,17 +932,22 @@ def is_seekable_format_file(filename): """ # Check argument if isinstance(filename, (str, bytes, PathLike)): - fp = io.open(filename, 'rb') + fp = open(filename, "rb") # noqa: SIM115 is_file_path = True - elif hasattr(filename, 'readable') and filename.readable() and \ - hasattr(filename, "seekable") and filename.seekable(): + elif ( + hasattr(filename, "readable") + and filename.readable() + and hasattr(filename, "seekable") + and filename.seekable() + ): fp = filename is_file_path = False orig_pos = fp.tell() else: raise TypeError( - ('filename argument should be a str/bytes/PathLike object, ' - 'or a file object that is readable and seekable.')) + "filename argument should be a str/bytes/PathLike object, " + "or a file object that is readable and seekable." 
+ ) # Write mode uses less RAM table = _SeekTable(read_mode=False) diff --git a/tests/test_seekable.py b/tests/test_seekable.py index 126d21c..bbc90a5 100644 --- a/tests/test_seekable.py +++ b/tests/test_seekable.py @@ -707,7 +707,7 @@ def test_init_with_x_mode(self): os.remove(filename) def test_init_bad_mode(self): - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): SeekableZstdFile(BytesIO(COMPRESSED), (3, "x")) with self.assertRaises(ValueError): SeekableZstdFile(BytesIO(COMPRESSED), "") @@ -856,7 +856,7 @@ def seek(offset, whence=0): return get_file # test .close() method - with patch("io.open", mock_open(io.open)): + with patch("builtins.open", mock_open(io.open)): with self.assertRaisesRegex(OSError, 'xyz'): SeekableZstdFile(filename, 'ab') @@ -1407,7 +1407,7 @@ def seekable(*args, **kwargs): return get_file # append 1 - with patch("io.open", mock_open(io.open)): + with patch("builtins.open", mock_open(io.open)): with self.assertWarnsRegex(RuntimeWarning, (r"at the end of the file " r"can't be overwritten" @@ -1419,7 +1419,7 @@ def seekable(*args, **kwargs): f.close() # append 2 - with patch("io.open", mock_open(io.open)): + with patch("builtins.open", mock_open(io.open)): with self.assertWarnsRegex(RuntimeWarning, (r"at the end of the file " r"can't be overwritten" @@ -1465,7 +1465,7 @@ def seekable(*args, **kwargs): return get_file # append - with patch("io.open", mock_open(io.open)): + with patch("builtins.open", mock_open(io.open)): with self.assertRaisesRegex( TypeError, (r"In SeekableZstdFile's append mode \('a', 'ab'\)," diff --git a/tests/test_zstd.py b/tests/test_zstd.py index 2931642..a8de8ec 100644 --- a/tests/test_zstd.py +++ b/tests/test_zstd.py @@ -2295,7 +2295,7 @@ def test_init_with_x_mode(self): os.remove(filename) def test_init_bad_mode(self): - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): ZstdFile(BytesIO(COMPRESSED_100_PLUS_32KB), (3, "x")) with self.assertRaises(ValueError): ZstdFile(BytesIO(COMPRESSED_100_PLUS_32KB), "")
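
Aside for reviewers, and not part of the patch itself: the hunks above reformat SeekableZstdFile without intending to change its public behaviour, so a short, self-contained sketch of that API can help when checking the edges the diff touches (frame cutting driven by max_frame_content_size, explicit FLUSH_FRAME, the append-mode requirement for a real file path, and the is_seekable_format_file helper). The file name "example.zst" and the byte counts below are illustrative assumptions only, not values taken from the patch.

    # Illustrative sketch only; exercises the public SeekableZstdFile API
    # whose implementation is reformatted above. "example.zst" and the
    # sizes used here are assumptions made for the demo.
    from pyzstd import SeekableZstdFile

    # Write mode: a new frame is cut automatically once max_frame_content_size
    # decompressed bytes accumulate; FLUSH_FRAME ends the current frame early.
    with SeekableZstdFile("example.zst", "w", max_frame_content_size=1024) as f:
        f.write(b"a" * 4096)                   # split into frames of <= 1 KiB content
        f.flush(SeekableZstdFile.FLUSH_FRAME)  # explicitly close any open frame
        f.write(b"tail")
        print(f.seek_table_info)               # summary of the frames written so far

    # Accepts a path or a readable, seekable file object.
    print(SeekableZstdFile.is_seekable_format_file("example.zst"))

    # Read mode: seek() uses the seek table to jump to the frame containing
    # the target offset instead of decompressing from the beginning.
    with SeekableZstdFile("example.zst", "r") as f:
        f.seek(4000)
        print(f.read(10))

    # Append mode needs a file path (str/bytes/PathLike) rather than a file
    # object, so the trailing seek-table frame can be located and rewritten.
    with SeekableZstdFile("example.zst", "a") as f:
        f.write(b"more data")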