Skip to content

Commit c43e01d

Browse files
committed
Add better error handling for repover duplicate content
Added a proper error class for duplicate content handling and some more logging to report exactly which content items are conflicting. Closes: #7184
1 parent 86a7293 commit c43e01d

5 files changed

Lines changed: 75 additions & 24 deletions

File tree

CHANGES/7184.feature

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Added better error handling for failures when creating RepositoryVersions.
2+
Now the duplicate content pks are shown in the logs.

pulpcore/exceptions/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,6 @@
2424
ValidationError,
2525
MissingDigestValidationError,
2626
UnsupportedDigestValidationError,
27+
DuplicateContentInRepositoryError,
2728
)
2829
from .plugin import MissingPlugin

pulpcore/exceptions/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from __future__ import annotations
12
import http.client
23
from gettext import gettext as _
34

pulpcore/exceptions/validation.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,21 @@ def __init__(self, message=None, verified=None):
119119

120120
def __str__(self):
121121
return f"[{self.error_code}] {self.message}"
122+
123+
124+
class DuplicateContentInRepositoryError(ValidationError):
    """
    Raised when duplicate content is detected within a Repository (Version).

    Args:
        duplicate_count (int): Number of duplicate content units that were found.
        correlation_id (str): Correlation id (cid) under which the per-duplicate
            details were logged.
    """

    error_code = "PLP0022"

    def __init__(self, duplicate_count: int, correlation_id: str):
        self.dup_count = duplicate_count
        self.cid = correlation_id

    def __str__(self):
        # Translate the untranslated template first, then interpolate. The
        # original applied .format() before _(), so gettext was handed an
        # already-interpolated string whose msgid can never match a catalog
        # entry; it was also missing the space before "(see" and the closing
        # parenthesis after "details".
        return f"[{self.error_code}] " + _(
            "Found {n} duplicate contents in repository version "
            "(see the logs (cid={cid}) for details)."
        ).format(n=self.dup_count, cid=self.cid)

pulpcore/plugin/repo_version_utils.py

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
from pulpcore.app.files import validate_file_paths
77
from pulpcore.app.models import Content, ContentArtifact
88
from pulpcore.app.util import batch_qs
9+
from pulpcore.exceptions import DuplicateContentInRepositoryError
10+
from collections import defaultdict
11+
from django_guid import get_guid
12+
from typing import NamedTuple
13+
from uuid import UUID
914

1015

1116
_logger = logging.getLogger(__name__)
def validate_duplicate_content(version):
    """
    Validate that the repository version contains no duplicate content.

    Uses repo_key_fields to determine if content is duplicated. When duplicates
    exist, one log line per duplicated key tuple is emitted (tagged with the
    current correlation id) before the error is raised.

    Args:
        version: The repository version to validate (presumably a
            RepositoryVersion model instance -- confirm against callers).

    Raises:
        DuplicateContentInRepositoryError: If repo version has duplicate content.
    """
    total_duplicates = 0
    correlation_id = get_guid()
    for type_obj in version.repository.CONTENT_TYPES:
        # Content types without repo key fields cannot collide by definition.
        if type_obj.repo_key_fields == ():
            continue
        pulp_type = type_obj.get_pulp_type()
        unique_keys = type_obj.repo_key_fields
        content_qs = type_obj.objects.filter(pk__in=version.content.filter(pulp_type=pulp_type))
        dup_count = count_duplicates(content_qs, unique_keys)
        if dup_count > 0:
            # Accumulate instead of overwrite: the original reassigned dup_count
            # on every content type, so duplicates found in an earlier type were
            # silently forgotten (and no error raised) whenever a later type had
            # none.
            total_duplicates += dup_count
            # At this point the task already failed, so we'll pay extra queries
            # to collect duplicates and provide more useful logs.
            for duplicate in collect_duplicates(content_qs, unique_keys):
                keyset_value = duplicate.keyset_value
                duplicate_pks = duplicate.duplicate_pks
                _logger.info(f"Duplicates found: {pulp_type=}; {keyset_value=}; {duplicate_pks=}")
    if total_duplicates > 0:
        raise DuplicateContentInRepositoryError(total_duplicates, correlation_id)
106+
107+
108+
class DuplicateEntry(NamedTuple):
    """
    A group of content units that share the same repo-key values.

    # keyset_value: the tuple of repo_key_fields values shared by the group
    #   (field values come from values_list, so they are not necessarily str).
    # duplicate_pks: stringified primary keys of every unit in the group --
    #   collect_duplicates stores str(pk), so the original list[UUID]
    #   annotation was wrong.
    """

    keyset_value: tuple
    duplicate_pks: list[str]
112+
113+
def count_duplicates(content_qs, unique_keys: tuple[str, ...]) -> int:
    """
    Return the number of duplicated rows in content_qs w.r.t. unique_keys.

    Counts the queryset twice -- total and distinct over unique_keys -- and
    returns the difference, i.e. how many rows repeat an already-seen key
    tuple. (Annotation fixed: ``tuple[str]`` meant a 1-tuple, but repo key
    fields may be several.)
    """
    total_rows = content_qs.count()
    distinct_rows = content_qs.distinct(*unique_keys).count()
    return total_rows - distinct_rows
117+
118+
119+
def collect_duplicates(content_qs, unique_keys: tuple[str, ...]) -> list["DuplicateEntry"]:
    """
    Group duplicated content in content_qs by their repo-key values.

    Orders the queryset by unique_keys and walks it once, so rows with equal
    key tuples are adjacent; whenever a row repeats the previous key tuple,
    both rows' pks are recorded under that tuple. (Annotation fixed:
    ``tuple[str]`` meant a 1-tuple.)

    Returns:
        One DuplicateEntry per duplicated key tuple, carrying the stringified
        pks of every content unit that shares it.
    """
    prev_keyset = None
    prev_pk = None
    keyset_to_pks = defaultdict(list)
    rows = content_qs.values_list(*unique_keys, "pk")
    for row in rows.order_by(*unique_keys).iterator():
        keyset = row[:-1]
        pk = str(row[-1])
        if keyset == prev_keyset:
            pks = keyset_to_pks[keyset]
            if len(pks) == 0:
                # The previous row didn't know it was a duplicate yet.
                pks.append(prev_pk)
            pks.append(pk)
        prev_keyset = keyset
        prev_pk = pk
    return [
        DuplicateEntry(keyset_value=keyset, duplicate_pks=pks)
        for keyset, pks in keyset_to_pks.items()
    ]
110139

111140

112141
def validate_version_paths(version):

0 commit comments

Comments
 (0)