Skip to content

Commit 1197084

Browse files
committed
Fixing ETags to support single AND multipart forms (fixes #2)
1 parent 7c3e49c commit 1197084

File tree

7 files changed

+276
-19
lines changed

7 files changed

+276
-19
lines changed

RSXML-Python.code-workspace

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@
1818
"python.analysis.extraPaths": [
1919
"./src"
2020
],
21+
"autopep8.args": [
22+
"--max-line-length=240"
23+
],
24+
"pylint.args": [
25+
"--disable=C0301,C0114,C0103,W0719,W0718",
26+
"--max-line-length=240"
27+
],
2128
"python.terminal.activateEnvironment": true,
2229
"python.testing.pytestEnabled": true,
2330
"python.testing.unittestEnabled": true,

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ urls = { "Source" = "https://github.com/Riverscapes/RiverscapesXML", "Homepage"
3838

3939
[project.optional-dependencies]
4040
dev = [
41+
"boto3>=1.42.10",
42+
"botocore>=1.42.10",
43+
"questionary>=2.1.1",
4144
"pytest>=8.3.4",
4245
"ruff>=0.1.0",
4346
"nox>=2023.4.22",

scripts/verify_etags_s3.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
""" Verify Etags
2+
3+
This is just a little helper script to upload files to S3 using both single part
4+
and multipart uploads, and then fetch the Etag from S3 to verify that our etag
5+
calculation matches what S3 produces.
6+
7+
These values go into the test_etags.py unit tests to verify correctness of our etag calculations.
8+
9+
There is no reason to run this script other than to regenerate those values if the etag calculation
10+
algorithm changes.
11+
12+
"""
13+
import os
14+
15+
import boto3
16+
import questionary
17+
from botocore.exceptions import NoCredentialsError
18+
19+
from rsxml.dotenv import parse_dotenv
20+
21+
22+
def upload_file(file_path: str, bucket_name: str, object_name=None, force_multipart=False):
    """Upload a file to an S3 bucket and return the ETag S3 reports for it.

    :param file_path: File to upload
    :param bucket_name: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :param force_multipart: If True, forces multipart upload even for small files
    :return: The remote ETag string (as returned by head_object) if the file was uploaded, else None
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_path)

    s3_client = boto3.client('s3')

    # Build the transfer configuration up front so there is a single upload call below.
    if force_multipart:
        # A 1-byte multipart threshold forces multipart upload for any non-empty file.
        # 50MB chunks match the rsxml defaults so the resulting ETag is comparable.
        config = boto3.s3.transfer.TransferConfig(
            multipart_threshold=1,
            multipart_chunksize=50 * 1024 * 1024  # 50MB chunks to match rsxml defaults
        )
    else:
        # Force a single-part upload by setting a threshold (10GB) larger than any test file.
        config = boto3.s3.transfer.TransferConfig(
            multipart_threshold=10 * 1024 * 1024 * 1024  # 10GB threshold
        )

    try:
        print(f"Uploading {file_path} to {bucket_name}/{object_name}...")
        s3_client.upload_file(file_path, bucket_name, object_name, Config=config)

        # Fetch the object metadata to get the ETag S3 actually stored
        response = s3_client.head_object(Bucket=bucket_name, Key=object_name)
        remote_etag = response['ETag']
        print(f"Upload Successful. Remote ETag: {remote_etag}")
        return remote_etag

    except NoCredentialsError:
        print("Credentials not available")
        return None
    except Exception as e:  # pylint: disable=broad-except  # best-effort helper script: report and continue
        print(f"Upload failed: {e}")
        return None
71+
72+
def main():
    """Interactively upload every regular file in a directory twice — once as a
    single-part upload and once as a forced multipart upload — and print the
    ETag S3 reports for each mode, for pasting into the test_etags.py fixtures."""

    bucket_name = questionary.text("S3 Bucket name:").ask()
    # Bail out before asking any further questions if no bucket was given (or the prompt was cancelled)
    if not bucket_name:
        return
    s3_prefix = questionary.text("S3 Prefix (optional):", default="matt/s3etagtest").ask()

    dir_path = questionary.path("Directory containing files to upload:").ask()
    if not dir_path:
        return

    if not os.path.exists(dir_path):
        print(f"Directory {dir_path} does not exist.")
        return

    # Regular, non-hidden files only (skip dot-files and subdirectories)
    files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and not f.startswith('.')]

    print(f"Found {len(files)} files in {dir_path}")

    for filename in files:
        file_path = os.path.join(dir_path, filename)

        print(f"\n--- Processing {filename} ---")

        # 1. Upload as Single Part (Standard MD5)
        print("1. Uploading as Single Part...")
        etag_single = upload_file(file_path, bucket_name, s3_prefix + f"/single_part/{filename}", force_multipart=False)

        # 2. Upload as Multipart
        print("2. Uploading as Multipart...")
        etag_multipart = upload_file(file_path, bucket_name, s3_prefix + f"/multipart/{filename}", force_multipart=True)
        print(f"Summary for {filename}:")
        print(f"    Single Part ETag: {etag_single}")
        print(f"    Multipart ETag: {etag_multipart}")


if __name__ == "__main__":
    main()

src/rsxml/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "2.2.0"
1+
__version__ = "2.2.1"

src/rsxml/etag.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,20 @@
44
from rsxml.constants import MULTIPART_CHUNK_SIZE, MULTIPART_THRESHOLD
55

66

7-
def _md5(contents: str):
8-
return hashlib.md5(contents).hexdigest()
9-
10-
117
def calculate_etag(
128
filePath: str,
139
chunk_size_bytes: int = MULTIPART_CHUNK_SIZE,
1410
chunk_thresh_bytes: int = MULTIPART_THRESHOLD,
15-
):
11+
force_single_part: bool = False,
12+
) -> str:
1613
"""Calculate the Etag of a file. This is useful for figuring out if
1714
it needs to be uploaded to the warehouse or not.
1815
1916
Args:
2017
filePath (str): path to the file we want an etag for
2118
chunk_size_bytes (int): The size of a multipart upload
2219
chunk_thresh_bytes (int): The threshold before we start using multipart uploads
20+
force_single_part (bool): If True, force a single part etag (standard MD5) even for large files.
2321
2422
Returns:
2523
str: The Etag of the file
@@ -29,26 +27,30 @@ def calculate_etag(
2927

3028
etag = ""
3129
# For files smaller than the threshold size we just MD5 the whole file
32-
if filesize_bytes < chunk_thresh_bytes:
33-
with open(filePath, "rb") as file:
34-
etag = _md5(file.read())
30+
if filesize_bytes < chunk_thresh_bytes or force_single_part:
31+
hash_obj = hashlib.md5()
32+
with open(filePath, "rb") as f:
33+
# Read in chunks to avoid memory issues with large files
34+
for chunk in iter(lambda: f.read(4096 * 1024), b""):
35+
hash_obj.update(chunk)
36+
etag = hash_obj.hexdigest()
3537
# For large files we need to use the MD5 hashing scheme prescribed by S3 for multipart uploads
3638
else:
3739
parts = filesize_bytes // chunk_size_bytes
3840
if filesize_bytes % chunk_size_bytes > 0:
3941
parts += 1
4042

41-
total_md5 = ""
43+
md5_digests = []
4244
with open(filePath, "rb") as file:
4345
for part in range(parts):
4446
skip_bytes = chunk_size_bytes * part
4547
total_bytes_left = filesize_bytes - skip_bytes
4648
bytes_to_read = min(total_bytes_left, chunk_size_bytes)
4749
file.seek(skip_bytes)
4850
buffer = file.read(bytes_to_read)
49-
total_md5 += _md5(buffer)
51+
md5_digests.append(hashlib.md5(buffer).digest())
5052

51-
combined_hash = _md5(bytes.fromhex(total_md5))
53+
combined_hash = hashlib.md5(b"".join(md5_digests)).hexdigest()
5254
etag = f"{combined_hash}-{parts}"
5355

5456
return f'"{etag}"'

tests/test_etags.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,20 @@ def test_calculate_etag(self):
2424
data_dir = env["ETAG_TEST_DATA"]
2525

2626
etag_verify = {
27-
"cat.jpg": '"a7eac640e7a66bdb9a0855c5137c81e5"',
28-
"5mb.zip": '"91eccd44f01401d67e88ffab3ed9bb29"',
29-
"100Mb.zip": '"bcae7ed6c3162532280c2a5447ba9484-2"',
30-
"262Mb.zip": '"f028a132dcd7c2180b4bd530ce952a8f-6"',
31-
"2Gb.zip": '"b61fe85ee6b4ab829b568c5f4453e95c-41"',
27+
"cat.jpg": ['"a7eac640e7a66bdb9a0855c5137c81e5"',
28+
'"665bec20e3e85efd5055ecb9ae5a1c99-1"'],
29+
"5mb.zip": ['"0bcc4b703f25a9caf1b79316a79555c6"',
30+
'"80ab5d2025dea57e7f6977cef01b0d25-1"'],
31+
"100Mb.zip": ['"a7bf4a3167615963ec5216b0ae395792"',
32+
'"3d7a5327d0882dfe163f2176e5619b4c-2"'],
33+
"262Mb.zip": ['"4c19ac8705002920e5ef7535fb2f35e1"',
34+
'"12878d9ab1bb5f9e0d35470ebd468f21-6"'],
35+
"2Gb.zip": ['"f77c0c2655ccdaf6f6363b981b149fc9"',
36+
'"0d61a9abe6db277e0c84407122404529-41"'],
3237
}
3338
for filename, etag in etag_verify.items():
3439
filename = os.path.join(data_dir, filename)
35-
self.assertEqual(etag, calculate_etag(filename))
40+
# Standard single part calculation. This will give us the etag form for small files that doesn't end with "-N"
41+
self.assertEqual(etag[0], calculate_etag(filename, force_single_part=True))
42+
# Force multipart calculation by setting threshold to 0. This ensures we get the multipart etag even for small files.
43+
self.assertEqual(etag[1], calculate_etag(filename, force_single_part=False, chunk_thresh_bytes=0))

0 commit comments

Comments
 (0)