diff --git a/CHANGELOG.md b/CHANGELOG.md index 09464f62..7744b095 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- Converter for Bavaria, Germany + +### Changed + +- Change implicit `block_size=0` through URL query parameter to explicit attribute in converter class - Improve Converter template.py usability ## [v0.10.0] - 2025-03-11 diff --git a/fiboa_cli/convert_utils.py b/fiboa_cli/convert_utils.py index 806f01f7..bb82b77f 100644 --- a/fiboa_cli/convert_utils.py +++ b/fiboa_cli/convert_utils.py @@ -125,7 +125,7 @@ def add_asset_to_collection(collection, output_file, rows=None, columns=None): def stream_file(fs, src_uri, dst_file, chunk_size=10 * 1024 * 1024): - with fs.open(src_uri, mode="rb") as f: + with fs.open(src_uri, mode="rb", block_size=0) as f: while True: chunk = f.read(chunk_size) if not chunk: @@ -172,6 +172,7 @@ class BaseConverter: source_variants: Optional[dict[dict[str, str] | str]] = None variant: str = None open_options = {} + avoid_range_request = False years: Optional[dict[dict[int, str] | str]] = None year: str = None @@ -218,7 +219,7 @@ def layer_filter(self, layer: str, uri: str) -> bool: def post_migrate(self, gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: return gdf - def get_cache(self, cache_folder=None, force=False): + def get_cache(self, cache_folder=None, force=False, **kwargs): if cache_folder is None: if not force: return None, None @@ -228,12 +229,12 @@ def get_cache(self, cache_folder=None, force=False): with TemporaryDirectory(**_kwargs) as tmp_folder: cache_folder = tmp_folder - cache_fs = get_fs(cache_folder) + cache_fs = get_fs(cache_folder, **kwargs) if not cache_fs.exists(cache_folder): cache_fs.makedirs(cache_folder) return cache_fs, cache_folder - def download_files(self, uris, cache_folder=None): + def download_files(self, uris, cache_folder=None, **kwargs): """Download (and cache) files from various sources""" if isinstance(uris, str): uris = {uris: name_from_uri(uris)} @@ -249,7 +250,7 @@ def download_files(self, uris, cache_folder=None): else: name = target - source_fs = get_fs(uri) + source_fs = get_fs(uri, **kwargs) cache_fs, cache_folder = self.get_cache(cache_folder, force=True) if isinstance(source_fs, LocalFileSystem): @@ -501,7 +502,10 @@ def convert( raise ValueError("No input files provided") log("Getting file(s) if not cached yet") - paths = self.download_files(urls, cache) + request_args = {} + if self.avoid_range_request: + request_args["block_size"] = 0 + paths = self.download_files(urls, cache, **request_args) kwargs.update(self.open_options) gdf = self.read_data(paths, **kwargs) diff --git a/fiboa_cli/converter_rest.py b/fiboa_cli/converter_rest.py index 04d5f0a8..d62c0e1c 100644 --- a/fiboa_cli/converter_rest.py +++ b/fiboa_cli/converter_rest.py @@ -40,7 +40,7 @@ def get_data(self, paths, **kwargs): return super().get_data(paths, **kwargs) base_url = paths[0] # loop over paths to support more than 1 source - source_fs = get_fs(base_url) + source_fs = get_fs(base_url, **kwargs) cache_fs, cache_folder = self.get_cache(self.cache_folder) service_metadata = requests.get(base_url, {"f": "pjson"}).json() diff --git a/fiboa_cli/datasets/be_wal.py b/fiboa_cli/datasets/be_wal.py index 5aa28ce3..2c0bee98 100644 --- a/fiboa_cli/datasets/be_wal.py +++ b/fiboa_cli/datasets/be_wal.py @@ -8,10 +8,11 @@ class Converter(AdminConverterMixin, BaseConverter): sources = { - "https://geoservices.wallonie.be/geotraitement/spwdatadownload/get/2a0d9be0-ac3d-443e-9db0-a7cfb0f128e2/LU_ExistingLandUse_SIGEC2022.gml.zip?blocksize=0": [ + "https://geoservices.wallonie.be/geotraitement/spwdatadownload/get/2a0d9be0-ac3d-443e-9db0-a7cfb0f128e2/LU_ExistingLandUse_SIGEC2022.gml.zip": [ "LU_ExistingLandUse_SIGEC2022.gml" ] } + avoid_range_request = True id = "be_wal" admin_region_code = "WAL" short_name = "Belgium, Wallonia" diff --git a/fiboa_cli/datasets/de_by.py b/fiboa_cli/datasets/de_by.py new file mode 100644 index 00000000..218db9d6 --- /dev/null +++ b/fiboa_cli/datasets/de_by.py @@ -0,0 +1,57 @@ +from ..convert_utils import BaseConverter +from .commons.admin import AdminConverterMixin + + +class Converter(AdminConverterMixin, BaseConverter): + sources = "https://geodaten.bayern.de/odd/m/3/daten/ln/landnutzung.gpkg" + avoid_range_request = True + + id = "de_by" + admin_subdivision_code = "BY" + short_name = "Germany, Bavaria" + title = "Field boundaries for Bavaria, Germany" + description = """A field block (German: "Feldblock") is a contiguous agricultural area surrounded by permanent boundaries, which is cultivated by one or more farmers with one or more crops, is fully or partially set aside or is fully or partially taken out of production.""" + license = "CC-BY-4.0" + attribution = "Datenquelle: Bayerische Vermessungsverwaltung – www.geodaten.bayern.de" + providers = [ + { + "name": "Bayerische Vermessungsverwaltung", + "url": "https://www.ldbv.bayern.de", + "roles": ["producer", "licensor"], + } + ] + extensions = {"https://fiboa.github.io/flik-extension/v0.1.0/schema.yaml"} + + columns = { + "geometry": "geometry", + "objid": ["id", "flik"], + "datumderletztenueberpruefung": "determination_datetime", + "beginnt": "datetime:first_determination", + "bewirtschaftung": "cultivation", + # "artderbetriebsflaeche": "artderbetriebsflaeche", + # "name": "name", + # "istweiterenutzung": "istweiterenutzung", + # "mappingannahme": "mappingannahme", + "quellobjektid": "source_id", + } + missing_schemas = { + "properties": { + "datetime:first_determination": {"type": "date-time"}, + "cultivation": {"type": "string"}, + # "artderbetriebsflaeche": {"type": "string"}, + # "name": {"type": "string"}, + # "istweiterenutzung": {"type": "string"}, + # "mappingannahme": {"type": "boolean"}, + "source_id": {"type": "string"}, + } + } + + column_filters = { + # see https://www.adv-online.de/GeoInfoDok/Aktuelle-Anwendungsschemata/Landnutzung-1.0.2/binarywriterservlet?imgUid=be12989a-7b60-5819-393b-216067bef8a0&uBasVariant=11111111-1111-1111-1111-111111111111#_C10573-_A10573_44376 + "bewirtschaftung": lambda col: col.isin( + ["1010", "1011", "1012", "1013", "1014", "1030", "1040", "1050"] + ) + } + + def layer_filter(self, layer: str, uri: str) -> bool: + return layer == "ln_landwirtschaft" diff --git a/fiboa_cli/datasets/es_ar.py b/fiboa_cli/datasets/es_ar.py index 607c344f..943cb61b 100644 --- a/fiboa_cli/datasets/es_ar.py +++ b/fiboa_cli/datasets/es_ar.py @@ -8,10 +8,11 @@ class ARConverter(ESBaseConverter): # These files can be annoying to download (web server failure, no http-range support for continuation) # Alternative is to download the files by municipality, check the atom.xml sources = { - "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec22_sigpac.shp.zip&blocksize=0": "rec22_sigpac.shp.zip", - "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec44_sigpac.shp.zip&blocksize=0": "rec44_sigpac.shp.zip", - "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec50_sigpac.shp.zip&blocksize=0": "rec50_sigpac.shp.zip", + "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec22_sigpac.shp.zip": "rec22_sigpac.shp.zip", + "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec44_sigpac.shp.zip": "rec44_sigpac.shp.zip", + "https://icearagon.aragon.es/datosdescarga/descarga.php?file=/CartoTema/sigpac/rec50_sigpac.shp.zip": "rec50_sigpac.shp.zip", } + avoid_range_request = True id = "es_ar" short_name = "Spain Aragon" diff --git a/fiboa_cli/datasets/es_nc.py b/fiboa_cli/datasets/es_nc.py index 6dd2e80b..3416d10c 100644 --- a/fiboa_cli/datasets/es_nc.py +++ b/fiboa_cli/datasets/es_nc.py @@ -72,4 +72,4 @@ def download_files(self, uris, cache_folder=None): # Hostname has invalid SSL, prefill cache and avoid ssl-errors self.prefill_cache(uris, cache_folder) - return super().download_files(uris, cache_folder=cache_folder) + return super().download_files(uris, cache_folder) diff --git a/fiboa_cli/datasets/sk.py b/fiboa_cli/datasets/sk.py index 0c585477..8f155db2 100644 --- a/fiboa_cli/datasets/sk.py +++ b/fiboa_cli/datasets/sk.py @@ -4,7 +4,7 @@ class Converter(AdminConverterMixin, BaseConverter): sources = { - "https://data.slovensko.sk/download?id=e39ad227-1899-4cff-b7c8-734f90aa0b59&blocksize=0": [ + "https://data.slovensko.sk/download?id=e39ad227-1899-4cff-b7c8-734f90aa0b59": [ "HU2024_20240917shp/HU2024_20240917.shp" ] } diff --git a/fiboa_cli/util.py b/fiboa_cli/util.py index 317215e4..bc4cf9f1 100644 --- a/fiboa_cli/util.py +++ b/fiboa_cli/util.py @@ -103,28 +103,24 @@ def load_datatypes(version): return response["$defs"] -def get_fs(url_or_path: str) -> AbstractFileSystem: +def get_fs(url_or_path: str, **kwargs) -> AbstractFileSystem: """Choose fsspec filesystem by sniffing input url""" parsed = urlparse(url_or_path) if parsed.scheme in ("http", "https"): - if re.search(r"[?&]blocksize=0", url_or_path): - # We read in chunks. Some origin-server don't support http-range request - # Add an additional blocksize=0 parameter to your url for a workaround - return HTTPFileSystem(block_size=0) - return HTTPFileSystem() + return HTTPFileSystem(**kwargs) if parsed.scheme == "s3": from s3fs import S3FileSystem - return S3FileSystem() + return S3FileSystem(**kwargs) if parsed.scheme == "gs": from gcsfs import GCSFileSystem - return GCSFileSystem() + return GCSFileSystem(**kwargs) - return LocalFileSystem() + return LocalFileSystem(**kwargs) def is_valid_file_uri(uri, extensions=[]):