Skip to content

Commit 1197084

Browse files
committed
Fixing ETags to support single AND multipart forms (fixes #2)
1 parent 7c3e49c commit 1197084

File tree

7 files changed

+276
-19
lines changed

7 files changed

+276
-19
lines changed

RSXML-Python.code-workspace

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@
1818
"python.analysis.extraPaths": [
1919
"./src"
2020
],
21+
"autopep8.args": [
22+
"--max-line-length=240"
23+
],
24+
"pylint.args": [
25+
"--disable=C0301,C0114,C0103,W0719,W0718",
26+
"--max-line-length=240"
27+
],
2128
"python.terminal.activateEnvironment": true,
2229
"python.testing.pytestEnabled": true,
2330
"python.testing.unittestEnabled": true,

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ urls = { "Source" = "https://github.com/Riverscapes/RiverscapesXML", "Homepage"
3838

3939
[project.optional-dependencies]
4040
dev = [
41+
"boto3>=1.42.10",
42+
"botocore>=1.42.10",
43+
"questionary>=2.1.1",
4144
"pytest>=8.3.4",
4245
"ruff>=0.1.0",
4346
"nox>=2023.4.22",

scripts/verify_etags_s3.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
""" Verify Etags
2+
3+
This is just a little helper script to upload files to S3 using both single part
4+
and multipart uploads, and then fetch the Etag from S3 to verify that our etag
5+
calculation matches what S3 produces.
6+
7+
These values go into the test_etags.py unit tests to verify correctness of our etag calculations.
8+
9+
There is no reason to run this script other than to regenerate those values if the etag calculation
10+
algorithm changes.
11+
12+
"""
13+
import os
14+
15+
import boto3
16+
import questionary
17+
from botocore.exceptions import NoCredentialsError
18+
19+
from rsxml.dotenv import parse_dotenv
20+
21+
22+
def upload_file(file_path: str, bucket_name: str, object_name=None, force_multipart=False):
    """Upload a file to an S3 bucket and return the ETag S3 reports for it.

    :param file_path: File to upload
    :param bucket_name: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :param force_multipart: If True, forces multipart upload even for small files
    :return: The remote ETag string (as returned by head_object) if the file was uploaded, else None
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_path)

    s3_client = boto3.client('s3')

    # Build the transfer configuration up front so there is a single upload call below.
    if force_multipart:
        # A 1-byte multipart threshold forces multipart upload for any non-empty file.
        # 50MB chunks match the rsxml defaults so the resulting ETag is comparable.
        config = boto3.s3.transfer.TransferConfig(
            multipart_threshold=1,
            multipart_chunksize=50 * 1024 * 1024  # 50MB chunks to match rsxml defaults
        )
    else:
        # Force a single-part upload by setting a threshold (10GB) larger than any test file.
        config = boto3.s3.transfer.TransferConfig(
            multipart_threshold=10 * 1024 * 1024 * 1024  # 10GB threshold
        )

    try:
        print(f"Uploading {file_path} to {bucket_name}/{object_name}...")
        s3_client.upload_file(file_path, bucket_name, object_name, Config=config)

        # Fetch the object metadata to get the ETag S3 actually stored
        response = s3_client.head_object(Bucket=bucket_name, Key=object_name)
        remote_etag = response['ETag']
        print(f"Upload Successful. Remote ETag: {remote_etag}")
        return remote_etag

    except NoCredentialsError:
        print("Credentials not available")
        return None
    except Exception as e:  # pylint: disable=broad-except  # best-effort helper script: report and continue
        print(f"Upload failed: {e}")
        return None
71+
72+
def main():
    """Interactively upload every regular file in a directory twice — once as a
    single-part upload and once as a forced multipart upload — and print the
    ETag S3 reports for each mode, for pasting into the test_etags.py fixtures."""

    bucket_name = questionary.text("S3 Bucket name:").ask()
    # Bail out before asking any further questions if no bucket was given (or the prompt was cancelled)
    if not bucket_name:
        return
    s3_prefix = questionary.text("S3 Prefix (optional):", default="matt/s3etagtest").ask()

    dir_path = questionary.path("Directory containing files to upload:").ask()
    if not dir_path:
        return

    if not os.path.exists(dir_path):
        print(f"Directory {dir_path} does not exist.")
        return

    # Regular, non-hidden files only (skip dot-files and subdirectories)
    files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and not f.startswith('.')]

    print(f"Found {len(files)} files in {dir_path}")

    for filename in files:
        file_path = os.path.join(dir_path, filename)

        print(f"\n--- Processing {filename} ---")

        # 1. Upload as Single Part (Standard MD5)
        print("1. Uploading as Single Part...")
        etag_single = upload_file(file_path, bucket_name, s3_prefix + f"/single_part/{filename}", force_multipart=False)

        # 2. Upload as Multipart
        print("2. Uploading as Multipart...")
        etag_multipart = upload_file(file_path, bucket_name, s3_prefix + f"/multipart/{filename}", force_multipart=True)
        print(f"Summary for {filename}:")
        print(f"    Single Part ETag: {etag_single}")
        print(f"    Multipart ETag: {etag_multipart}")


if __name__ == "__main__":
    main()

src/rsxml/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "2.2.0"
1+
__version__ = "2.2.1"

src/rsxml/etag.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,20 @@
44
from rsxml.constants import MULTIPART_CHUNK_SIZE, MULTIPART_THRESHOLD
55

66

7-
def _md5(contents: str):
8-
return hashlib.md5(contents).hexdigest()
9-
10-
117
def calculate_etag(
128
filePath: str,
139
chunk_size_bytes: int = MULTIPART_CHUNK_SIZE,
1410
chunk_thresh_bytes: int = MULTIPART_THRESHOLD,
15-
):
11+
force_single_part: bool = False,
12+
) -> str:
1613
"""Calculate the Etag of a file. This is useful for figuring out if
1714
it needs to be uploaded to the warehouse or not.
1815
1916
Args:
2017
filePath (str): path to the file we want an etag for
2118
chunk_size_bytes (int): The size of a multipart upload
2219
chunk_thresh_bytes (int): The threshold before we start using multipart uploads
20+
force_single_part (bool): If True, force a single part etag (standard MD5) even for large files.
2321
2422
Returns:
2523
str: The Etag of the file
@@ -29,26 +27,30 @@ def calculate_etag(
2927

3028
etag = ""
3129
# For files smaller than the threshold size we just MD5 the whole file
32-
if filesize_bytes < chunk_thresh_bytes:
33-
with open(filePath, "rb") as file:
34-
etag = _md5(file.read())
30+
if filesize_bytes < chunk_thresh_bytes or force_single_part:
31+
hash_obj = hashlib.md5()
32+
with open(filePath, "rb") as f:
33+
# Read in chunks to avoid memory issues with large files
34+
for chunk in iter(lambda: f.read(4096 * 1024), b""):
35+
hash_obj.update(chunk)
36+
etag = hash_obj.hexdigest()
3537
# For large files we need to use the MD5 hashing scheme prescribed by S3 for multipart uploads
3638
else:
3739
parts = filesize_bytes // chunk_size_bytes
3840
if filesize_bytes % chunk_size_bytes > 0:
3941
parts += 1
4042

41-
total_md5 = ""
43+
md5_digests = []
4244
with open(filePath, "rb") as file:
4345
for part in range(parts):
4446
skip_bytes = chunk_size_bytes * part
4547
total_bytes_left = filesize_bytes - skip_bytes
4648
bytes_to_read = min(total_bytes_left, chunk_size_bytes)
4749
file.seek(skip_bytes)
4850
buffer = file.read(bytes_to_read)
49-
total_md5 += _md5(buffer)
51+
md5_digests.append(hashlib.md5(buffer).digest())
5052

51-
combined_hash = _md5(bytes.fromhex(total_md5))
53+
combined_hash = hashlib.md5(b"".join(md5_digests)).hexdigest()
5254
etag = f"{combined_hash}-{parts}"
5355

5456
return f'"{etag}"'

tests/test_etags.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,20 @@ def test_calculate_etag(self):
2424
data_dir = env["ETAG_TEST_DATA"]
2525

2626
etag_verify = {
27-
"cat.jpg": '"a7eac640e7a66bdb9a0855c5137c81e5"',
28-
"5mb.zip": '"91eccd44f01401d67e88ffab3ed9bb29"',
29-
"100Mb.zip": '"bcae7ed6c3162532280c2a5447ba9484-2"',
30-
"262Mb.zip": '"f028a132dcd7c2180b4bd530ce952a8f-6"',
31-
"2Gb.zip": '"b61fe85ee6b4ab829b568c5f4453e95c-41"',
27+
"cat.jpg": ['"a7eac640e7a66bdb9a0855c5137c81e5"',
28+
'"665bec20e3e85efd5055ecb9ae5a1c99-1"'],
29+
"5mb.zip": ['"0bcc4b703f25a9caf1b79316a79555c6"',
30+
'"80ab5d2025dea57e7f6977cef01b0d25-1"'],
31+
"100Mb.zip": ['"a7bf4a3167615963ec5216b0ae395792"',
32+
'"3d7a5327d0882dfe163f2176e5619b4c-2"'],
33+
"262Mb.zip": ['"4c19ac8705002920e5ef7535fb2f35e1"',
34+
'"12878d9ab1bb5f9e0d35470ebd468f21-6"'],
35+
"2Gb.zip": ['"f77c0c2655ccdaf6f6363b981b149fc9"',
36+
'"0d61a9abe6db277e0c84407122404529-41"'],
3237
}
3338
for filename, etag in etag_verify.items():
3439
filename = os.path.join(data_dir, filename)
35-
self.assertEqual(etag, calculate_etag(filename))
40+
# Standard single part calculation. This will give us the etag form for small files that doesn't end with "-N"
41+
self.assertEqual(etag[0], calculate_etag(filename, force_single_part=True))
42+
# Force multipart calculation by setting threshold to 0. This ensures we get the multipart etag even for small files.
43+
self.assertEqual(etag[1], calculate_etag(filename, force_single_part=False, chunk_thresh_bytes=0))

0 commit comments

Comments
 (0)