2 changes: 2 additions & 0 deletions CHANGES/7184.feature
@@ -0,0 +1,2 @@
Added better error handling for duplicate-content errors when creating RepositoryVersions.
The pks of the duplicate content are now shown in the logs.
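For illustration, the log_duplicate helper introduced further down emits one INFO line per duplicated key set; with made-up values, such a line would look roughly like:

    Duplicates found: pulp_type='file.file'; keyset_value=('PULP_MANIFEST',); duplicate_pks=['pk-aaa', 'pk-bbb']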
1 change: 1 addition & 0 deletions pulpcore/exceptions/__init__.py
@@ -24,5 +24,6 @@
ValidationError,
MissingDigestValidationError,
UnsupportedDigestValidationError,
DuplicateContentInRepositoryError,
)
from .plugin import MissingPlugin
1 change: 1 addition & 0 deletions pulpcore/exceptions/base.py
@@ -1,3 +1,4 @@
from __future__ import annotations
import http.client
from gettext import gettext as _

18 changes: 18 additions & 0 deletions pulpcore/exceptions/validation.py
@@ -119,3 +119,21 @@ def __init__(self, message=None, verified=None):

def __str__(self):
return f"[{self.error_code}] {self.message}"


class DuplicateContentInRepositoryError(ValidationError):
"""
Raised when duplicate content is detected within a Repository (Version).
"""

error_code = "PLP0022"

def __init__(self, duplicate_count: int, correlation_id: str):
self.dup_count = duplicate_count
self.cid = correlation_id

    def __str__(self):
        return f"[{self.error_code}] " + _(
            "Found {n} duplicate contents in repository version "
            "(see the logs (cid={cid}) for details)."
        ).format(n=self.dup_count, cid=self.cid)
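A minimal usage sketch of the new exception, assuming the class as defined above (the count and correlation id here are made up; in Pulp the cid comes from django_guid):

    from pulpcore.exceptions import DuplicateContentInRepositoryError

    try:
        raise DuplicateContentInRepositoryError(3, "1e6b1e0d8cfb4e5e")
    except DuplicateContentInRepositoryError as exc:
        # __str__ renders the error code, count, and correlation id
        print(exc)
        # [PLP0022] Found 3 duplicate contents in repository version
        # (see the logs (cid=1e6b1e0d8cfb4e5e) for details).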
81 changes: 57 additions & 24 deletions pulpcore/plugin/repo_version_utils.py
@@ -6,6 +6,11 @@
from pulpcore.app.files import validate_file_paths
from pulpcore.app.models import Content, ContentArtifact
from pulpcore.app.util import batch_qs
from collections import defaultdict
from typing import NamedTuple
from uuid import UUID
from django_guid import get_guid
from pulpcore.exceptions import DuplicateContentInRepositoryError


_logger = logging.getLogger(__name__)
@@ -78,35 +83,63 @@ def validate_duplicate_content(version):
Uses repo_key_fields to determine if content is duplicated.

Raises:
ValueError: If repo version has duplicate content.
        DuplicateContentInRepositoryError: If repo version has duplicate content.
"""
error_messages = []

dup_count = 0
correlation_id = get_guid()
for type_obj in version.repository.CONTENT_TYPES:
if type_obj.repo_key_fields == ():
continue

pulp_type = type_obj.get_pulp_type()
repo_key_fields = type_obj.repo_key_fields
new_content_total = type_obj.objects.filter(
pk__in=version.content.filter(pulp_type=pulp_type)
).count()
unique_new_content_total = (
type_obj.objects.filter(pk__in=version.content.filter(pulp_type=pulp_type))
.distinct(*repo_key_fields)
.count()
)

if unique_new_content_total < new_content_total:
error_messages.append(
_(
"More than one {pulp_type} content with the duplicate values for {fields}."
).format(pulp_type=pulp_type, fields=", ".join(repo_key_fields))
)
if error_messages:
raise ValueError(
_("Cannot create repository version. {msg}").format(msg=", ".join(error_messages))
)
unique_keys = type_obj.repo_key_fields
content_qs = type_obj.objects.filter(pk__in=version.content.filter(pulp_type=pulp_type))
        type_dup_count = count_duplicates(content_qs, unique_keys)
        if type_dup_count > 0:
            # accumulate across content types; the total decides the raise below
            dup_count += type_dup_count
            # At this point the task already failed, so we'll pay extra queries
            # to collect duplicates and provide more useful logs
            for duplicate in collect_duplicates(content_qs, unique_keys):
                log_duplicate(pulp_type, duplicate)
if dup_count > 0:
raise DuplicateContentInRepositoryError(dup_count, correlation_id)


class DuplicateEntry(NamedTuple):
keyset_value: tuple[str, ...]
duplicate_pks: list[UUID]


def log_duplicate(pulp_type: str, duplicate: DuplicateEntry):
keyset_value = duplicate.keyset_value
duplicate_pks = duplicate.duplicate_pks
_logger.info(f"Duplicates found: {pulp_type=}; {keyset_value=}; {duplicate_pks=}")


def count_duplicates(content_qs, unique_keys: tuple[str, ...]) -> int:
    """Return the number of rows in content_qs that collide on the unique_keys fields."""
new_content_total = content_qs.count()
unique_new_content_total = content_qs.distinct(*unique_keys).count()
return new_content_total - unique_new_content_total


def collect_duplicates(content_qs, unique_keys: tuple[str, ...]) -> list[DuplicateEntry]:
    """Group the pks of duplicated content by the key-field values they share.

    Relies on ordering the queryset by unique_keys so that duplicates are adjacent.
    """
last_keyset = None
last_pk = None
keyset_to_contents = defaultdict(list)
content_qs = content_qs.values_list(*unique_keys, "pk")
for values in content_qs.order_by(*unique_keys).iterator():
keyset_value = values[:-1]
pk = str(values[-1])
if keyset_value == last_keyset:
dup_pk_list = keyset_to_contents[keyset_value]
# the previous duplicate didn't know it was a duplicate
if len(dup_pk_list) == 0:
dup_pk_list.append(last_pk)
dup_pk_list.append(pk)
last_keyset = keyset_value
last_pk = pk
duplicate_entries = []
for keyset_value, pk_list in keyset_to_contents.items():
duplicate_entries.append(DuplicateEntry(duplicate_pks=pk_list, keyset_value=keyset_value))
return duplicate_entries


def validate_version_paths(version):
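The grouping in collect_duplicates depends only on rows arriving sorted by the key fields, so duplicates are adjacent. Here is a standalone sketch of the same technique over plain tuples (data and names are illustrative; no Django involved):

    from collections import defaultdict

    # (keyset, pk) rows, pre-sorted by keyset, mirroring
    # content_qs.values_list(*unique_keys, "pk").order_by(*unique_keys)
    rows = [
        (("a.txt",), "pk1"),
        (("a.txt",), "pk2"),
        (("b.txt",), "pk3"),
        (("c.txt",), "pk4"),
        (("c.txt",), "pk5"),
    ]

    def group_duplicates(rows):
        last_keyset, last_pk = None, None
        groups = defaultdict(list)
        for keyset, pk in rows:
            if keyset == last_keyset:
                dup_pks = groups[keyset]
                if not dup_pks:
                    # the previous row was the first duplicate; record it too
                    dup_pks.append(last_pk)
                dup_pks.append(pk)
            last_keyset, last_pk = keyset, pk
        return dict(groups)

    print(group_duplicates(rows))
    # {('a.txt',): ['pk1', 'pk2'], ('c.txt',): ['pk4', 'pk5']}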