Skip to content

Commit bebc032

Browse files
authored
Feat: Use separate file to record product info (#120)
We found that AWS file metadata has a length limitation, which means it is not suitable for storing long information such as product info. So we need to use a new file *.prodinfo for each uploaded file. Rules: * Only valid product tarball files should contain this prodinfo file * Metadata files like maven-metadata, npm package-level package.json, or index files will not have this prodinfo.
1 parent b352bcd commit bebc032

File tree

13 files changed

+477
-493
lines changed

13 files changed

+477
-493
lines changed

charon/constants.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,9 @@
134134
<hr/>
135135
<main>
136136
<ul style="list-style: none outside;" id="contents">
137-
{% for item in index.items %}{% if item.endswith("/") %}
138-
<li><a href="{{ item }}index.html" title="{{ item }}">{{ item }}</a></li>
139-
{% else %}
137+
{% for item in index.items %}
140138
<li><a href="{{ item }}" title="{{ item }}">{{ item }}</a></li>
141-
{% endif %}{% endfor%}
139+
{% endfor%}
142140
</ul>
143141
</main>
144142
<hr/>
@@ -175,3 +173,5 @@
175173
</body>
176174
</html>
177175
'''
176+
177+
PROD_INFO_SUFFIX = ".prodinfo"

charon/pkgs/npm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def handle_npm_uploading(
113113
_, _failed_metas = client.upload_metadatas(
114114
meta_file_paths=[meta_files[META_FILE_GEN_KEY]],
115115
bucket_name=bucket,
116-
product=product,
116+
product=None,
117117
root=target_dir,
118118
key_prefix=prefix_
119119
)
@@ -189,7 +189,7 @@ def handle_npm_del(
189189
all_meta_files.append(file)
190190
client.delete_files(
191191
file_paths=all_meta_files, bucket_name=bucket,
192-
product=product, root=target_dir, key_prefix=prefix_
192+
product=None, root=target_dir, key_prefix=prefix_
193193
)
194194
failed_metas = []
195195
if META_FILE_GEN_KEY in meta_files:

charon/storage.py

Lines changed: 91 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""
1616
from boto3_type_annotations.s3.service_resource import Object
1717
from charon.utils.files import read_sha1
18+
from charon.constants import PROD_INFO_SUFFIX
1819

1920
from boto3 import session
2021
from botocore.errorfactory import ClientError
@@ -134,8 +135,6 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int)
134135
f_meta = {}
135136
if sha1.strip() != "":
136137
f_meta[CHECKSUM_META_KEY] = sha1
137-
if product:
138-
f_meta[PRODUCT_META_KEY] = product
139138
try:
140139
if not self.__dry_run:
141140
if len(f_meta) > 0:
@@ -149,6 +148,8 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int)
149148
full_file_path,
150149
ExtraArgs={'ContentType': content_type}
151150
)
151+
if product:
152+
self.__update_prod_info(path_key, bucket_name, [product])
152153
logger.info('Uploaded %s to bucket %s', path, bucket_name)
153154
uploaded_files.append(path_key)
154155
except (ClientError, HTTPClientError) as e:
@@ -170,25 +171,15 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int)
170171
'one in S3. Product: %s', path_key, product)
171172
return False
172173

173-
prods = []
174-
try:
175-
prods = f_meta[PRODUCT_META_KEY].split(",")
176-
except KeyError:
177-
pass
178-
if not self.__dry_run and product not in prods:
174+
(prods, no_error) = self.__get_prod_info(path_key, bucket_name)
175+
if not self.__dry_run and no_error and product not in prods:
179176
logger.info(
180177
"File %s has new product, updating the product %s",
181178
full_file_path,
182179
product,
183180
)
184181
prods.append(product)
185-
try:
186-
self.__update_file_metadata(file_object, bucket_name,
187-
{PRODUCT_META_KEY: ",".join(prods)})
188-
except (ClientError, HTTPClientError) as e:
189-
logger.error("ERROR: file %s not uploaded to bucket"
190-
" %s due to error: %s ", full_file_path,
191-
bucket_name, e)
182+
if not self.__update_prod_info(path_key, bucket_name, prods):
192183
return False
193184
return True
194185

@@ -198,7 +189,7 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int)
198189

199190
def upload_metadatas(
200191
self, meta_file_paths: List[str], bucket_name: str,
201-
product: Optional[str], root="/", key_prefix: str = None
192+
product: Optional[str] = None, root="/", key_prefix: str = None
202193
) -> Tuple[List[str], List[str]]:
203194
""" Upload a list of metadata files to s3 bucket. This function is very similar to
204195
upload_files, except:
@@ -237,14 +228,6 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int):
237228
)
238229

239230
f_meta[CHECKSUM_META_KEY] = sha1
240-
prods = (
241-
f_meta[PRODUCT_META_KEY].split(",")
242-
if PRODUCT_META_KEY in f_meta
243-
else []
244-
)
245-
if product and product not in prods:
246-
prods.append(product)
247-
f_meta[PRODUCT_META_KEY] = ",".join(prods)
248231
try:
249232
if not self.__dry_run:
250233
if need_overwritten:
@@ -253,15 +236,16 @@ def path_upload_handler(full_file_path: str, path: str, index: int, total: int):
253236
Metadata=f_meta,
254237
ContentType=content_type
255238
)
256-
257-
else:
258-
# Should we update the s3 object metadata for metadata files?
259-
try:
260-
self.__update_file_metadata(file_object, bucket_name, f_meta)
261-
except (ClientError, HTTPClientError) as e:
262-
logger.error("ERROR: metadata %s not updated to bucket"
263-
" %s due to error: %s ", full_file_path,
264-
bucket_name, e)
239+
if product:
240+
# NOTE: This should not happen for most cases, as most of the metadata
241+
# file does not have product info. Just leave for requirement change in
242+
# future
243+
(prods, no_error) = self.__get_prod_info(path_key, bucket_name)
244+
if not no_error:
245+
return False
246+
if no_error and product not in prods:
247+
prods.append(product)
248+
if not self.__update_prod_info(path_key, bucket_name, prods):
265249
return False
266250
logger.info('Updated metadata %s to bucket %s', path, bucket_name)
267251
uploaded_files.append(path_key)
@@ -306,11 +290,9 @@ def path_delete_handler(full_file_path: str, path: str, index: int, total: int):
306290
# the product reference counts will be used (from object metadata).
307291
prods = []
308292
if product:
309-
try:
310-
prods = file_object.metadata[PRODUCT_META_KEY].split(",")
311-
except KeyError:
312-
pass
313-
293+
(prods, no_error) = self.__get_prod_info(path_key, bucket_name)
294+
if not no_error:
295+
return False
314296
if product in prods:
315297
prods.remove(product)
316298

@@ -321,10 +303,10 @@ def path_delete_handler(full_file_path: str, path: str, index: int, total: int):
321303
" will remove %s from its metadata",
322304
path, product
323305
)
324-
self.__update_file_metadata(
325-
file_object,
306+
self.__update_prod_info(
307+
path_key,
326308
bucket_name,
327-
{PRODUCT_META_KEY: ",".join(prods)},
309+
prods,
328310
)
329311
logger.info(
330312
"Removed product %s from metadata of file %s",
@@ -341,6 +323,8 @@ def path_delete_handler(full_file_path: str, path: str, index: int, total: int):
341323
try:
342324
if not self.__dry_run:
343325
bucket.delete_objects(Delete={"Objects": [{"Key": path_key}]})
326+
if not self.__update_prod_info(path_key, bucket_name, prods):
327+
return False
344328
logger.info("Deleted %s from bucket %s", path, bucket_name)
345329
deleted_files.append(path)
346330
return True
@@ -439,23 +423,74 @@ def __file_exists(self, file_object: Object) -> bool:
439423
try:
440424
file_object.load()
441425
return True
442-
except ClientError as e:
443-
if e.response["Error"]["Code"] == "404":
426+
except (ClientError, HTTPClientError) as e:
427+
if isinstance(e, ClientError) and e.response["Error"]["Code"] == "404":
444428
return False
445429
else:
446-
raise e
447-
448-
def __update_file_metadata(
449-
self, file_object: s3.Object, bucket_name: str, metadata: Dict
450-
):
451-
if not self.__dry_run:
452-
file_object.metadata.update(metadata)
453-
file_object.copy_from(
454-
CopySource={"Bucket": bucket_name, "Key": file_object.key},
455-
Metadata=file_object.metadata,
456-
ContentType=file_object.content_type,
457-
MetadataDirective="REPLACE",
458-
)
430+
logger.error("Error: file existence check failed due "
431+
"to error: %s", e)
432+
433+
# def __update_file_metadata(
434+
# self, file_object: s3.Object, bucket_name: str, metadata: Dict
435+
# ):
436+
# if not self.__dry_run:
437+
# file_object.metadata.update(metadata)
438+
# file_object.copy_from(
439+
# CopySource={"Bucket": bucket_name, "Key": file_object.key},
440+
# Metadata=file_object.metadata,
441+
# ContentType=file_object.content_type,
442+
# MetadataDirective="REPLACE",
443+
# )
444+
445+
def __get_prod_info(
446+
self, file: str, bucket_name: str
447+
) -> Tuple[List[str], bool]:
448+
logger.debug("Getting product infomation for file %s", file)
449+
prod_info_file = file + PROD_INFO_SUFFIX
450+
try:
451+
info_file_content = self.read_file_content(bucket_name, prod_info_file)
452+
prods = [p.strip() for p in info_file_content.split("\n")]
453+
logger.debug("Got product information as below %s", prods)
454+
return (prods, True)
455+
except (ClientError, HTTPClientError) as e:
456+
logger.error("ERROR: Can not get product info for file %s "
457+
"due to error: %s", file, e)
458+
return ([], False)
459+
460+
def __update_prod_info(
461+
self, file: str, bucket_name: str, prods: List[str]
462+
) -> bool:
463+
prod_info_file = file + PROD_INFO_SUFFIX
464+
bucket = self.__get_bucket(bucket_name)
465+
file_obj = bucket.Object(prod_info_file)
466+
content_type = "text/plain"
467+
if len(prods) > 0:
468+
logger.debug("Updating product infomation for file %s "
469+
"with products: %s", file, prods)
470+
try:
471+
file_obj.put(
472+
Body="\n".join(prods).encode("utf-8"),
473+
ContentType=content_type
474+
)
475+
logger.debug("Updated product infomation for file %s", file)
476+
return True
477+
except (ClientError, HTTPClientError) as e:
478+
logger.error("ERROR: Can not update product info for file %s "
479+
"due to error: %s", file, e)
480+
return False
481+
else:
482+
logger.debug("Removing product infomation file for file %s "
483+
"because no products left", file)
484+
try:
485+
if self.__file_exists(file_obj):
486+
bucket.delete_objects(
487+
Delete={"Objects": [{"Key": prod_info_file}]})
488+
logger.debug("Removed product infomation file for file %s", file)
489+
return True
490+
except (ClientError, HTTPClientError) as e:
491+
logger.error("ERROR: Can not delete product info file for file %s "
492+
"due to error: %s", file, e)
493+
return False
459494

460495
def __do_path_cut_and(
461496
self, file_paths: List[str],

tests/base.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,16 @@
1717
import tempfile
1818
import os
1919
import shutil
20+
import boto3
21+
from typing import List
2022
from charon.utils.files import overwrite_file
2123
from charon.config import CONFIG_FILE
24+
from charon.constants import PROD_INFO_SUFFIX
25+
from charon.pkgs.pkg_utils import is_metadata
26+
from charon.storage import PRODUCT_META_KEY, CHECKSUM_META_KEY
27+
from tests.commons import TEST_BUCKET
28+
from boto3_type_annotations import s3
29+
2230

2331
SHORT_TEST_PREFIX = "ga"
2432
LONG_TEST_PREFIX = "earlyaccess/all"
@@ -72,3 +80,49 @@ def get_temp_dir(self) -> str:
7280

7381
def get_config_base(self) -> str:
7482
return os.path.join(self.get_temp_dir(), '.charon')
83+
84+
85+
class PackageBaseTest(BaseTest):
86+
def setUp(self):
87+
super().setUp()
88+
# mock_s3 is used to generate expected content
89+
self.mock_s3 = self.__prepare_s3()
90+
self.mock_s3.create_bucket(Bucket=TEST_BUCKET)
91+
self.test_bucket = self.mock_s3.Bucket(TEST_BUCKET)
92+
93+
def tearDown(self):
94+
bucket = self.mock_s3.Bucket(TEST_BUCKET)
95+
try:
96+
bucket.objects.all().delete()
97+
bucket.delete()
98+
except ValueError:
99+
pass
100+
super().tearDown()
101+
102+
def __prepare_s3(self):
103+
return boto3.resource('s3')
104+
105+
def check_product(self, file: str, prods: List[str]):
106+
prod_file = file + PROD_INFO_SUFFIX
107+
prod_f_obj = self.test_bucket.Object(prod_file)
108+
content = str(prod_f_obj.get()['Body'].read(), 'utf-8')
109+
self.assertEqual(
110+
set(prods),
111+
set([f for f in content.split("\n") if f.strip() != ""])
112+
)
113+
114+
def check_content(self, objs: List[s3.ObjectSummary], products: List[str]):
115+
for obj in objs:
116+
file_obj = obj.Object()
117+
if not file_obj.key.endswith(PROD_INFO_SUFFIX):
118+
if not is_metadata(file_obj.key):
119+
self.check_product(file_obj.key, products)
120+
else:
121+
self.assertNotIn(PRODUCT_META_KEY, file_obj.metadata)
122+
if file_obj.key.endswith("maven-metadata.xml"):
123+
sha1_checksum = file_obj.metadata[CHECKSUM_META_KEY].strip()
124+
sha1_obj = self.test_bucket.Object(file_obj.key + ".sha1")
125+
sha1_file_content = str(sha1_obj.get()['Body'].read(), 'utf-8')
126+
self.assertEqual(sha1_checksum, sha1_file_content)
127+
self.assertIn(CHECKSUM_META_KEY, file_obj.metadata)
128+
self.assertNotEqual("", file_obj.metadata[CHECKSUM_META_KEY].strip())

tests/commons.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# For maven
2-
TEST_MVN_BUCKET = "test_bucket"
2+
TEST_BUCKET = "test_bucket"
33
COMMONS_CLIENT_456_FILES = [
44
"org/apache/httpcomponents/httpclient/4.5.6/httpclient-4.5.6.pom.sha1",
55
"org/apache/httpcomponents/httpclient/4.5.6/httpclient-4.5.6.jar",
@@ -49,6 +49,20 @@
4949
"commons-client-4.5.9/licenses/licenses.txt",
5050
"commons-client-4.5.9/README.md"
5151
]
52+
COMMONS_CLIENT_456_MVN_NUM = (
53+
len(COMMONS_CLIENT_456_FILES) +
54+
len(COMMONS_LOGGING_FILES))
55+
COMMONS_CLIENT_459_MVN_NUM = (
56+
len(COMMONS_CLIENT_459_FILES) +
57+
len(COMMONS_LOGGING_FILES))
58+
COMMONS_CLIENT_MVN_NUM = (
59+
len(COMMONS_CLIENT_456_FILES) +
60+
len(COMMONS_CLIENT_459_FILES) +
61+
len(COMMONS_LOGGING_FILES))
62+
COMMONS_CLIENT_META_NUM = (
63+
len(COMMONS_CLIENT_METAS) +
64+
len(COMMONS_LOGGING_METAS) +
65+
len(ARCHETYPE_CATALOG_FILES))
5266
# For maven indexes
5367
COMMONS_CLIENT_456_INDEXES = [
5468
"index.html",

0 commit comments

Comments (0)