From 27743d2340d5a538a1693170b00bcfc476f34138 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Tue, 10 Jun 2025 16:03:17 +0100 Subject: [PATCH 01/23] [CASSANDRA-20736] Add CMS membership directly to ClusterMetadata --- .../apache/cassandra/tcm/CMSMembership.java | 203 ++++++++++++++++++ .../apache/cassandra/tcm/ClusterMetadata.java | 127 ++++++++--- .../cassandra/tcm/ClusterMetadataService.java | 17 +- .../cassandra/tcm/membership/NodeVersion.java | 2 +- .../cassandra/tcm/serialization/Version.java | 38 ++-- 5 files changed, 330 insertions(+), 57 deletions(-) create mode 100644 src/java/org/apache/cassandra/tcm/CMSMembership.java diff --git a/src/java/org/apache/cassandra/tcm/CMSMembership.java b/src/java/org/apache/cassandra/tcm/CMSMembership.java new file mode 100644 index 000000000000..d83a5b950e90 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/CMSMembership.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import java.io.IOException; +import java.util.Objects; +import java.util.SortedSet; +import java.util.TreeSet; + +import com.google.common.collect.ImmutableSet; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.btree.BTreeSet; + +public class CMSMembership implements MetadataValue +{ + public static final Serializer serializer = new Serializer(); + public static final CMSMembership EMPTY = new CMSMembership(); + + private final Epoch lastModified; + private final BTreeSet fullMembers; + private final BTreeSet joiningMembers; + + private CMSMembership() + { + this(Epoch.EMPTY, + BTreeSet.empty(NodeId::compareTo), + BTreeSet.empty(NodeId::compareTo)); + } + + private CMSMembership(Epoch lastModified, BTreeSet fullMembers, BTreeSet joiningMembers) + { + this.lastModified = lastModified; + this.fullMembers = fullMembers; + this.joiningMembers = joiningMembers; + } + + + @Override + public CMSMembership withLastModified(Epoch epoch) + { + return lastModified.is(epoch) ? this : new CMSMembership(epoch, fullMembers, joiningMembers); + } + + @Override + public Epoch lastModified() + { + return lastModified; + } + + public ImmutableSet joiningMembers() + { + return ImmutableSet.copyOf(joiningMembers); + } + + public ImmutableSet fullMembers() + { + return ImmutableSet.copyOf(fullMembers); + } + + public CMSMembership startJoining(NodeId id) + { + if (joiningMembers.contains(id)) + throw new IllegalStateException(id + " is already joining the CMS"); + if (fullMembers.contains(id)) + throw new IllegalStateException(id + " has already fully joined the CMS"); + + return new CMSMembership(lastModified, fullMembers, joiningMembers.with(id)); + } + + public CMSMembership cancelJoining(NodeId id) + { + if (!joiningMembers.contains(id)) + throw new IllegalStateException(id + " is not currently joining the CMS"); + if (fullMembers.contains(id)) + throw new IllegalStateException(id + " has already fully joined the CMS"); + + return new CMSMembership(lastModified, fullMembers, joiningMembers.without(id)); + } + + public CMSMembership finishJoining(NodeId id) + { + if (!joiningMembers.contains(id)) + throw new IllegalStateException(id + " is not currently joining the CMS"); + if (fullMembers.contains(id)) + throw new IllegalStateException(id + " has already fully joined the CMS"); + + return new CMSMembership(lastModified, fullMembers.with(id), joiningMembers.without(id)); + } + + public CMSMembership leave(NodeId id) + { + if (joiningMembers.contains(id)) + throw new IllegalStateException(id + " is currently joining the CMS, "); + if (!fullMembers.contains(id)) + throw new IllegalStateException(id + " is not a CMS member"); + + return new CMSMembership(lastModified, fullMembers.without(id), joiningMembers); + } + + @Override + public String toString() + { + return "CMSMembership{" + + "lastModified=" + lastModified + + ", fullMembers=" + fullMembers + + ", joiningMembers=" + joiningMembers + + '}'; + } + + @Override + public final boolean equals(Object o) + { + if (!(o instanceof CMSMembership)) return false; + + CMSMembership that = (CMSMembership) o; + return Objects.equals(lastModified, that.lastModified) && + Objects.equals(fullMembers, that.fullMembers) && + Objects.equals(joiningMembers, that.joiningMembers); + } + + @Override + public int hashCode() + { + int result = Objects.hashCode(lastModified); + result = 31 * result + Objects.hashCode(fullMembers); + result = 31 * result + Objects.hashCode(joiningMembers); + return result; + } + + public static class Serializer implements MetadataSerializer + { + @Override + public void serialize(CMSMembership t, DataOutputPlus out, Version version) throws IOException + { + Epoch.serializer.serialize(t.lastModified, out); + + out.writeUnsignedVInt32(t.fullMembers.size()); + for (NodeId id : t.fullMembers) + NodeId.serializer.serialize(id, out, version); + + out.writeUnsignedVInt32(t.joiningMembers.size()); + for (NodeId id : t.joiningMembers) + NodeId.serializer.serialize(id, out, version); + } + + @Override + public CMSMembership deserialize(DataInputPlus in, Version version) throws IOException + { + Epoch lastModified = Epoch.serializer.deserialize(in, version); + + int fullMemberCount = in.readUnsignedVInt32(); + SortedSet fullMembers = new TreeSet<>(); + for (int i = 0; i < fullMemberCount; i++) + fullMembers.add(NodeId.serializer.deserialize(in, version)); + + int joiningMemberCount = in.readUnsignedVInt32(); + SortedSet joiningMembers = new TreeSet<>(); + for (int i = 0; i < joiningMemberCount; i++) + joiningMembers.add(NodeId.serializer.deserialize(in, version)); + + return new CMSMembership(lastModified, BTreeSet.of(fullMembers), BTreeSet.of(joiningMembers)) ; + } + + @Override + public long serializedSize(CMSMembership t, Version version) + { + long size = Epoch.serializer.serializedSize(t.lastModified); + + size += TypeSizes.sizeofUnsignedVInt(t.fullMembers.size()); + for (NodeId id : t.fullMembers) + size += NodeId.serializer.serializedSize(id, version); + + size += TypeSizes.sizeofUnsignedVInt(t.joiningMembers.size()); + for (NodeId id : t.joiningMembers) + size += NodeId.serializer.serializedSize(id, version); + + return size; + } + } + +} diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index fa2d8f5ee5f6..76bcb84757a9 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; @@ -58,7 +59,6 @@ import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.Keyspaces; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.service.accord.AccordStaleReplicas; @@ -86,7 +86,6 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; -import static com.google.common.collect.ImmutableSet.toImmutableSet; import static org.apache.cassandra.config.CassandraRelevantProperties.LINE_SEPARATOR; import static org.apache.cassandra.db.TypeSizes.sizeof; import static org.apache.cassandra.tcm.serialization.Version.MIN_ACCORD_VERSION; @@ -111,6 +110,7 @@ public class ClusterMetadata public final ConsensusMigrationState consensusMigrationState; public final ImmutableMap, ExtensionValue> extensions; public final AccordStaleReplicas accordStaleReplicas; + public final CMSMembership cmsMembership; // This isn't serialized as part of ClusterMetadata it's really just a view over the Directory. public final Locator locator; @@ -118,7 +118,6 @@ public class ClusterMetadata // These fields are lazy but only for the test purposes, since their computation requires initialization of the log ks private EndpointsForRange fullCMSReplicas; private Set fullCMSEndpoints; - private Set fullCMSIds; private DataPlacements writePlacementAllSettled; public ClusterMetadata(IPartitioner partitioner) @@ -147,7 +146,8 @@ public ClusterMetadata(IPartitioner partitioner, Directory directory, Distribute InProgressSequences.EMPTY, ConsensusMigrationState.EMPTY, ImmutableMap.of(), - AccordStaleReplicas.EMPTY); + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); } public ClusterMetadata(Epoch epoch, @@ -161,7 +161,8 @@ public ClusterMetadata(Epoch epoch, InProgressSequences inProgressSequences, ConsensusMigrationState consensusMigrationState, Map, ExtensionValue> extensions, - AccordStaleReplicas accordStaleReplicas) + AccordStaleReplicas accordStaleReplicas, + CMSMembership cmsMembership) { this(EMPTY_METADATA_IDENTIFIER, epoch, @@ -175,22 +176,25 @@ public ClusterMetadata(Epoch epoch, inProgressSequences, consensusMigrationState, extensions, - accordStaleReplicas); + accordStaleReplicas, + cmsMembership); } + private ClusterMetadata(int metadataIdentifier, - Epoch epoch, - IPartitioner partitioner, - DistributedSchema schema, - Directory directory, - TokenMap tokenMap, - DataPlacements placements, - AccordFastPath accordFastPath, - LockedRanges lockedRanges, - InProgressSequences inProgressSequences, - ConsensusMigrationState consensusMigrationState, - Map, ExtensionValue> extensions, - AccordStaleReplicas accordStaleReplicas) + Epoch epoch, + IPartitioner partitioner, + DistributedSchema schema, + Directory directory, + TokenMap tokenMap, + DataPlacements placements, + AccordFastPath accordFastPath, + LockedRanges lockedRanges, + InProgressSequences inProgressSequences, + ConsensusMigrationState consensusMigrationState, + Map, ExtensionValue> extensions, + AccordStaleReplicas accordStaleReplicas, + CMSMembership cmsMembership) { // TODO: token map is a feature of the specific placement strategy, and so may not be a relevant component of // ClusterMetadata in the long term. We need to consider how the actual components of metadata can be evolved @@ -210,26 +214,35 @@ private ClusterMetadata(int metadataIdentifier, this.extensions = ImmutableMap.copyOf(extensions); this.locator = Locator.usingDirectory(directory); this.accordStaleReplicas = accordStaleReplicas; + this.cmsMembership = cmsMembership; } public Set fullCMSMembers() { if (fullCMSEndpoints == null) - this.fullCMSEndpoints = ImmutableSet.copyOf(placements.get(ReplicationParams.meta(this)).reads.byEndpoint().keySet()); + { + fullCMSEndpoints = ImmutableSet.copyOf(cmsMembership.fullMembers() + .stream() + .map(directory::endpoint) + .collect(Collectors.toSet())); + } return fullCMSEndpoints; } public Set fullCMSMemberIds() { - if (fullCMSIds == null) - this.fullCMSIds = placements.get(ReplicationParams.meta(this)).reads.byEndpoint().keySet().stream().map(directory::peerId).collect(toImmutableSet()); - return fullCMSIds; + return cmsMembership.fullMembers(); } public EndpointsForRange fullCMSMembersAsReplicas() { if (fullCMSReplicas == null) - fullCMSReplicas = placements.get(ReplicationParams.meta(this)).reads.forRange(MetaStrategy.entireRange).get(); + { + EndpointsForRange.Builder builder = EndpointsForRange.builder(MetaStrategy.entireRange); + for (NodeId nodeId : fullCMSMemberIds()) + builder.add(MetaStrategy.replica(directory.endpoint(nodeId))); + fullCMSReplicas = builder.build(); + } return fullCMSReplicas; } @@ -266,7 +279,8 @@ public ClusterMetadata forceEpoch(Epoch epoch) capLastModified(inProgressSequences, epoch), capLastModified(consensusMigrationState, epoch), capLastModified(extensions, epoch), - capLastModified(accordStaleReplicas, epoch)); + capLastModified(accordStaleReplicas, epoch), + capLastModified(cmsMembership, epoch)); } public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) @@ -289,7 +303,8 @@ public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) inProgressSequences, consensusMigrationState, extensions, - accordStaleReplicas); + accordStaleReplicas, + cmsMembership); } private static Map, ExtensionValue> capLastModified(Map, ExtensionValue> original, Epoch maxEpoch) @@ -419,6 +434,7 @@ public static class Transformer private final Map, ExtensionValue> extensions; private final Set modifiedKeys; private AccordStaleReplicas accordStaleReplicas; + private CMSMembership cmsMembership; private Transformer(ClusterMetadata metadata, Epoch epoch) { @@ -561,6 +577,30 @@ public Transformer left(NodeId id) return this; } + public Transformer startJoiningCMS(NodeId id) + { + cmsMembership = cmsMembership.startJoining(id); + return this; + } + + public Transformer finishJoiningCMS(NodeId id) + { + cmsMembership = cmsMembership.finishJoining(id); + return this; + } + + public Transformer cancelJoiningCMS(NodeId id) + { + cmsMembership = cmsMembership.cancelJoining(id); + return this; + } + + public Transformer leaveCMS(NodeId id) + { + cmsMembership = cmsMembership.leave(id); + return this; + } + public Transformer with(DataPlacements placements) { this.placements = placements; @@ -758,7 +798,8 @@ public Transformed build() inProgressSequences, consensusMigrationState, extensions, - accordStaleReplicas), + accordStaleReplicas, + cmsMembership), ImmutableSet.copyOf(modifiedKeys)); } @@ -776,7 +817,8 @@ public ClusterMetadata buildForGossipMode() inProgressSequences, consensusMigrationState, extensions, - accordStaleReplicas); + accordStaleReplicas, + cmsMembership); } @Override @@ -795,6 +837,7 @@ public String toString() ", inProgressSequences=" + inProgressSequences + ", consensusMigrationState=" + consensusMigrationState + ", extensions=" + extensions + + ", cmsMembership=" + cmsMembership + ", modifiedKeys=" + modifiedKeys + '}'; } @@ -881,7 +924,6 @@ public String legacyToString() @Override public String toString() { - // TODO is this supposed to be missing fields? return "ClusterMetadata{" + "epoch=" + epoch + ", schema=" + schema + @@ -890,6 +932,13 @@ public String toString() ", placements=" + placements + ", lockedRanges=" + lockedRanges + ", consensusMigrationState=" + lockedRanges + + ", accordFastPath=" + accordFastPath + + ", inProgressSequences=" + inProgressSequences + + ", consensusMigrationState=" + consensusMigrationState + + ", extensions=" + extensions + + ", accordStaleReplicas=" + accordStaleReplicas + + ", cmsMembership=" + cmsMembership + + ", cmsPlacement=" + cmsDataPlacement + '}'; } @@ -909,7 +958,8 @@ public boolean equals(Object o) inProgressSequences.equals(that.inProgressSequences) && consensusMigrationState.equals(that.consensusMigrationState) && accordStaleReplicas.equals(that.accordStaleReplicas) && - extensions.equals(that.extensions); + extensions.equals(that.extensions) && + cmsMembership.equals(that.cmsMembership); } private static final Logger logger = LoggerFactory.getLogger(ClusterMetadata.class); @@ -952,12 +1002,16 @@ public void dumpDiff(ClusterMetadata other) { logger.warn("Extensions differ: {} != {}", extensions, other.extensions); } + if (!cmsMembership.equals(other.cmsMembership)) + { + logger.warn("CMS Membership differ: {} != {}", cmsMembership, other.cmsMembership); + } } @Override public int hashCode() { - return Objects.hash(epoch, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, accordStaleReplicas, extensions); + return Objects.hash(epoch, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, accordStaleReplicas, extensions, cmsMembership); } public static ClusterMetadata current() @@ -1052,6 +1106,8 @@ public void serialize(ClusterMetadata metadata, DataOutputPlus out, Version vers assert key.valueType.isInstance(value); value.serialize(out, version); } + if (version.isAtLeast(Version.V7)) + CMSMembership.serializer.serialize(metadata.cmsMembership, out, version); } @Override @@ -1108,6 +1164,11 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE value.deserialize(in, version); extensions.put(key, value); } + + CMSMembership cmsMembership = CMSMembership.EMPTY; + if (version.isAtLeast(Version.V8)) + cmsMembership = CMSMembership.serializer.deserialize(in, version); + return new ClusterMetadata(clusterIdentifier, epoch, partitioner, @@ -1120,7 +1181,8 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE ips, consensusMigrationState, extensions, - staleReplicas); + staleReplicas, + cmsMembership); } private DistributedSchema deduplicateReplicationParams(DistributedSchema schema, DataPlacements placements) @@ -1168,6 +1230,9 @@ public long serializedSize(ClusterMetadata metadata, Version version) size += LockedRanges.serializer.serializedSize(metadata.lockedRanges, version) + InProgressSequences.serializer.serializedSize(metadata.inProgressSequences, version); + if (version.isAtLeast(Version.V8)) + size += CMSMembership.serializer.serializedSize(metadata.cmsMembership, version); + return size; } diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index e582271a84d9..d2328a993076 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -307,14 +307,15 @@ public static void empty(Keyspaces keyspaces) DatabaseDescriptor.getPartitioner(), new DistributedSchema(keyspaces), Directory.EMPTY, - new TokenMap(DatabaseDescriptor.getPartitioner()), - DataPlacements.empty(), - AccordFastPath.EMPTY, - LockedRanges.EMPTY, - InProgressSequences.EMPTY, - ConsensusMigrationState.EMPTY, - Collections.emptyMap(), - AccordStaleReplicas.EMPTY); + new TokenMap(DatabaseDescriptor.getPartitioner()), + DataPlacements.empty(), + AccordFastPath.EMPTY, + LockedRanges.EMPTY, + InProgressSequences.EMPTY, + ConsensusMigrationState.EMPTY, + Collections.emptyMap(), + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); LocalLog.LogSpec logSpec = LocalLog.logSpec() diff --git a/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java b/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java index de1bc71250d1..f3b78a1ee9b5 100644 --- a/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java +++ b/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java @@ -36,7 +36,7 @@ public class NodeVersion implements Comparable { public static final Serializer serializer = new Serializer(); - public static final Version CURRENT_METADATA_VERSION = Version.V8; + public static final Version CURRENT_METADATA_VERSION = Version.V9; public static final NodeVersion CURRENT = new NodeVersion(new CassandraVersion(FBUtilities.getReleaseVersionString()), CURRENT_METADATA_VERSION); private static final CassandraVersion SINCE_VERSION = CassandraVersion.CASSANDRA_5_1; diff --git a/src/java/org/apache/cassandra/tcm/serialization/Version.java b/src/java/org/apache/cassandra/tcm/serialization/Version.java index 9cbcb8587a74..1f15a2e31b95 100644 --- a/src/java/org/apache/cassandra/tcm/serialization/Version.java +++ b/src/java/org/apache/cassandra/tcm/serialization/Version.java @@ -38,46 +38,49 @@ public enum Version */ V0(0), /** - * - Moved Partitioner in ClusterMetadata serializer to be the first field - * - Added a counter to Directory serializer to keep track of NodeIds + * - Moved Partitioner in ClusterMetadata serializer to be the first field + * - Added a counter to Directory serializer to keep track of NodeIds */ V1(1), /** - * - Added version to PlacementForRange serializer - * - Serialize MemtableParams when serializing TableParams + * - Added version to PlacementForRange serializer + * - Serialize MemtableParams when serializing TableParams */ V2(2), /** - * - down nodes serialized in PrepareCMSReconfiguration + * - down nodes serialized in PrepareCMSReconfiguration */ V3(3), /** - * - Serialize allowAutoSnapshot and incrementalBackups when serializing TableParams + * - Serialize allowAutoSnapshot and incrementalBackups when serializing TableParams */ V4(4), /** - * - AlterSchema includes execution timestamp - * - PreInitialize includes datacenter (affects local serialization on first CMS node only) + * - AlterSchema includes execution timestamp + * - PreInitialize includes datacenter (affects local serialization on first CMS node only) */ V5(5), /** - * - CEP-42 - Constraints framework. New version due to modifications in table metadata serialization. + * - CEP-42 - Constraints framework. New version due to modifications in table metadata serialization. */ V6(6), /** - * - Track nodes removed - * - Column Metadata now stores a unique id - * - Added AccordFastPath - * - Added ConsensusMigrationState - * - Added AccordStaleReplicas - * - TableParam now has pendingDrop (accord table drop is multistep) + * - Track nodes removed + * - Column Metadata now stores a unique id + * - Added AccordFastPath + * - Added ConsensusMigrationState + * - Added AccordStaleReplicas + * - TableParam now has pendingDrop (accord table drop is multistep) */ V7(7), - /** - * - Comments and security labels for schema elements (keyspaces, tables, columns, UDTs, and UDT fields) + * - Comments and security labels for schema elements (keyspaces, tables, columns, UDTs, and UDT fields) */ V8(8), + /** + * - DataPlacements don't include MetaStrategy, replaced by ClusterMetadata.CMSMembership + */ + V9(9), UNKNOWN(Integer.MAX_VALUE); @@ -95,6 +98,7 @@ public enum Version } private final int version; + Version(int version) { this.version = version; From 6f1b7ded53bb04e6fd909aa67f986e605a8d0525 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Wed, 11 Jun 2025 17:49:49 +0100 Subject: [PATCH 02/23] [CASSANDRA-20736] Init singleton CMS cluster with restarts --- .../cassandra/tcm/AbstractLocalProcessor.java | 2 +- .../apache/cassandra/tcm/ClusterMetadata.java | 67 ++++++++++++++++--- .../cassandra/tcm/ClusterMetadataService.java | 2 +- .../org/apache/cassandra/tcm/Startup.java | 18 +++-- .../cassandra/tcm/migration/Election.java | 5 -- .../transformations/cms/PreInitialize.java | 17 ----- 6 files changed, 71 insertions(+), 40 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java index 0f42bae46261..40416a782157 100644 --- a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java +++ b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java @@ -53,7 +53,7 @@ public final Commit.Result commit(Entry.Id entryId, Transformation transform, fi while (!retryPolicy.hasExpired()) { ClusterMetadata previous = log.waitForHighestConsecutive(); - if (!previous.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())) + if (!previous.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort()) && previous.epoch.isAfter(Epoch.FIRST)) { String msg = String.format("Node %s is not a CMS member in epoch %s; members=%s", FBUtilities.getBroadcastAddressAndPort(), diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 76bcb84757a9..c7e4660f4e6e 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -59,6 +59,7 @@ import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.service.accord.AccordStaleReplicas; @@ -206,8 +207,8 @@ private ClusterMetadata(int metadataIdentifier, this.schema = schema; this.directory = directory; this.tokenMap = tokenMap; - this.placements = placements; this.accordFastPath = accordFastPath; + this.placements = addMetaPlacement(placements, cmsMembership); this.lockedRanges = lockedRanges; this.inProgressSequences = inProgressSequences; this.consensusMigrationState = consensusMigrationState; @@ -217,6 +218,16 @@ private ClusterMetadata(int metadataIdentifier, this.cmsMembership = cmsMembership; } + public Set fullCMSMemberIds() + { + return cmsMembership.fullMembers(); + } + + public boolean isCMSMember(InetAddressAndPort endpoint) + { + return fullCMSMembers().contains(endpoint); + } + public Set fullCMSMembers() { if (fullCMSEndpoints == null) @@ -229,11 +240,6 @@ public Set fullCMSMembers() return fullCMSEndpoints; } - public Set fullCMSMemberIds() - { - return cmsMembership.fullMembers(); - } - public EndpointsForRange fullCMSMembersAsReplicas() { if (fullCMSReplicas == null) @@ -246,9 +252,34 @@ public EndpointsForRange fullCMSMembersAsReplicas() return fullCMSReplicas; } - public boolean isCMSMember(InetAddressAndPort endpoint) + private DataPlacements addMetaPlacement(DataPlacements placements, CMSMembership cms) { - return fullCMSMembers().contains(endpoint); + if (epoch.isBefore(Epoch.FIRST)) + return placements; + + DataPlacement.Builder metaBuilder = DataPlacement.builder(); + if (epoch.is(Epoch.FIRST)) + { + metaBuilder.withReadReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); + metaBuilder.withWriteReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); + } + else + { + for (NodeId id : cms.fullMembers()) + { + Replica replica = MetaStrategy.replica(directory.endpoint(id)); + metaBuilder.withReadReplica(cms.lastModified(), replica); + metaBuilder.withWriteReplica(cms.lastModified(), replica); + } + + for(NodeId id : cms.joiningMembers()) + { + metaBuilder.withWriteReplica(cms.lastModified(), MetaStrategy.replica(directory.endpoint(id))); + } + } + return placements.unbuild() + .with(ReplicationParams.meta(this), metaBuilder.build()) + .build(); } public Transformer transformer() @@ -283,7 +314,10 @@ public ClusterMetadata forceEpoch(Epoch epoch) capLastModified(cmsMembership, epoch)); } - public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) + public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier, + NodeAddresses addresses, + NodeVersion version, + Location location) { if (this.metadataIdentifier != EMPTY_METADATA_IDENTIFIER) throw new IllegalStateException(String.format("Can only initialize cluster identifier once, but it was already set to %d", this.metadataIdentifier)); @@ -291,11 +325,21 @@ public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) if (clusterIdentifier == EMPTY_METADATA_IDENTIFIER) throw new IllegalArgumentException("Can not initialize cluster with empty cluster identifier"); + if (this.epoch.isAfter(Epoch.FIRST)) + throw new IllegalStateException(String.format("Can only initialize cluster identifier during epoch %d, but current epoch is %d", Epoch.FIRST.getEpoch(), epoch.getEpoch())); + + // Maybe register the first CMS node. If upgrading from gossip, this should be a no-op + Directory withRegistered = directory.with(addresses, location, version); + NodeId firstNode = withRegistered.peerId(addresses.broadcastAddress); + if (firstNode == null) + throw new IllegalStateException("Failed to find first CMS node in directory"); + + CMSMembership initialCMS = cmsMembership.startJoining(firstNode).finishJoining(firstNode); return new ClusterMetadata(clusterIdentifier, epoch, partitioner, schema, - directory, + withRegistered, tokenMap, placements, accordFastPath, @@ -304,7 +348,7 @@ public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) consensusMigrationState, extensions, accordStaleReplicas, - cmsMembership); + initialCMS); } private static Map, ExtensionValue> capLastModified(Map, ExtensionValue> original, Epoch maxEpoch) @@ -452,6 +496,7 @@ private Transformer(ClusterMetadata metadata, Epoch epoch) extensions = new HashMap<>(metadata.extensions); modifiedKeys = new HashSet<>(); accordStaleReplicas = metadata.accordStaleReplicas; + cmsMembership = metadata.cmsMembership; } public Epoch epoch() diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index d2328a993076..9c35f6a2b7bf 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -164,7 +164,7 @@ public static State state(ClusterMetadata metadata) // The node is a full member of the CMS if it has started participating in reads for distributed metadata table (which // implies it is a write replica as well). In other words, it's a fully joined member of the replica set responsible for // the distributed metadata table. - if (ClusterMetadata.current().isCMSMember(FBUtilities.getBroadcastAddressAndPort())) + if (metadata.epoch.isEqualOrBefore(Epoch.FIRST) || ClusterMetadata.current().isCMSMember(FBUtilities.getBroadcastAddressAndPort())) return LOCAL; return REMOTE; } diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index 970ea190c6e2..ebf24e47c3d2 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -63,8 +63,11 @@ import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.log.LogStorage; import org.apache.cassandra.tcm.log.SystemKeyspaceStorage; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; +import org.apache.cassandra.tcm.membership.NodeVersion; import org.apache.cassandra.tcm.migration.CMSInitializationRequest; import org.apache.cassandra.tcm.migration.Election; import org.apache.cassandra.tcm.ownership.UniformRangePlacement; @@ -144,13 +147,18 @@ public static void initialize(Set seeds, */ public static void initializeAsFirstCMSNode() { - InetAddressAndPort addr = FBUtilities.getBroadcastAddressAndPort(); - String datacenter = DatabaseDescriptor.getLocator().local().datacenter; - ClusterMetadataService.instance().log().bootstrap(addr, datacenter); + NodeAddresses addresses = NodeAddresses.current(); + Location location = DatabaseDescriptor.getLocator().local(); + ClusterMetadataService.instance().log().bootstrap(addresses.broadcastAddress, location.datacenter); ClusterMetadata metadata = ClusterMetadata.current(); assert ClusterMetadataService.state() == LOCAL : String.format("Can't initialize as node hasn't transitioned to CMS state. State: %s.\n%s", ClusterMetadataService.state(), metadata); - Initialize initialize = new Initialize(metadata.initializeClusterIdentifier(addr.hashCode())); - ClusterMetadataService.instance().commit(initialize); + Initialize initialize = new Initialize(metadata.initializeClusterIdentifier(addresses.broadcastAddress.hashCode(), + addresses, + NodeVersion.CURRENT, + location)); + ClusterMetadata initialized = ClusterMetadataService.instance().commit(initialize); + NodeId id = initialized.myNodeId(); + SystemKeyspace.setLocalHostId(id.toUUID()); } public static void initializeAsNonCmsNode(Function wrapProcessor) throws StartupException diff --git a/src/java/org/apache/cassandra/tcm/migration/Election.java b/src/java/org/apache/cassandra/tcm/migration/Election.java index 34ce077583a7..1f150a2ffa91 100644 --- a/src/java/org/apache/cassandra/tcm/migration/Election.java +++ b/src/java/org/apache/cassandra/tcm/migration/Election.java @@ -33,7 +33,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; @@ -49,7 +48,6 @@ import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.tcm.ownership.TokenMap; -import org.apache.cassandra.tcm.transformations.Register; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; @@ -137,9 +135,6 @@ private void finish(Set sendTo) initiator.compareAndSet(currentInitiator, MIGRATING)) { Startup.initializeAsFirstCMSNode(); - Register.maybeRegister(); - SystemKeyspace.setLocalHostId(ClusterMetadata.current().myNodeId().toUUID()); - updateInitiator(MIGRATING, MIGRATED); MessageDelivery.fanoutAndWait(messaging, sendTo, Verb.TCM_NOTIFY_REQ, DistributedMetadataLogKeyspace.getLogState(Epoch.EMPTY, false)); } diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java b/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java index e07d1f1a2989..ef7907bde93c 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java @@ -24,17 +24,12 @@ import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.Keyspaces; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; -import org.apache.cassandra.tcm.ownership.DataPlacement; -import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; @@ -95,18 +90,6 @@ public Result execute(ClusterMetadata metadata) // has been committed to the log, the actual content of PRE_INITIALIZE_CMS is irrelevant, even on // the first CMS node if it happens to replay it from its local storage after a restart. - DataPlacement.Builder dataPlacementBuilder = DataPlacement.builder(); - Replica replica = new Replica(addr, - MetaStrategy.partitioner.getMinimumToken(), - MetaStrategy.partitioner.getMinimumToken(), - true); - dataPlacementBuilder.reads.withReplica(Epoch.FIRST, replica); - dataPlacementBuilder.writes.withReplica(Epoch.FIRST, replica); - DataPlacements initialPlacement = metadata.placements.unbuild() - .with(ReplicationParams.simpleMeta(1, datacenter), - dataPlacementBuilder.build()).build(); - - transformer.with(initialPlacement); // re-initialise the schema distributed metadata keyspace so it gets the // correct replication settings based on the DC of the initial CMS node Keyspaces updated = metadata.schema.getKeyspaces() From c7871bd1e0b92bf8c35b38344655ea6a1c21ac73 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 12 Jun 2025 14:53:30 +0100 Subject: [PATCH 03/23] [CASSANDRA-20736] Update CMS reconfiguration --- .../locator/CMSPlacementStrategy.java | 3 +- .../cms/AdvanceCMSReconfiguration.java | 68 +++++++------------ .../cms/PrepareCMSReconfiguration.java | 8 +-- 3 files changed, 28 insertions(+), 51 deletions(-) diff --git a/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java b/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java index e00289aef667..cb4dfa82c676 100644 --- a/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java +++ b/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java @@ -90,7 +90,8 @@ public Set reconfigure(ClusterMetadata metadata) // Although MetaStrategy has its own entireRange, it uses a custom partitioner which isn't compatible with // regular, non-CMS placements. For that reason, we select replicas here using tokens provided by the - // globally configured partitioner. + // globally configured partitioner. This also has the benefit of making concurrent operations, such as + // bounces/upgrades/etc, safe for the CMS if they are replica aware. Token minToken = DatabaseDescriptor.getPartitioner().getMinimumToken(); EndpointsForRange endpoints = NetworkTopologyStrategy.calculateNaturalReplicas(minToken, new Range<>(minToken, minToken), diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/AdvanceCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/transformations/cms/AdvanceCMSReconfiguration.java index 614dd8a4eddb..f31a3fe503fb 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/AdvanceCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/AdvanceCMSReconfiguration.java @@ -29,15 +29,12 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.RangesByEndpoint; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.sequences.InProgressSequences; import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.cassandra.tcm.sequences.ReconfigureCMS; @@ -45,7 +42,6 @@ import org.apache.cassandra.tcm.serialization.Version; import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; -import static org.apache.cassandra.locator.MetaStrategy.entireRange; import static org.apache.cassandra.tcm.MultiStepOperation.Kind.RECONFIGURE_CMS; /** @@ -154,38 +150,26 @@ private Transformation.Result startAdd(ClusterMetadata prev, ReconfigureCMS sequ { // Pop the next node to be added from the list diff.additions NodeId addition = diff.additions.get(0); - InetAddressAndPort endpoint = prev.directory.endpoint(addition); - Replica replica = new Replica(endpoint, entireRange, true); - List newAdditions = new ArrayList<>(diff.additions.subList(1, diff.additions.size())); // Check that the candidate is not already a CMS member - ReplicationParams metaParams = ReplicationParams.meta(prev); - RangesByEndpoint readReplicas = prev.placements.get(metaParams).reads.byEndpoint(); - RangesByEndpoint writeReplicas = prev.placements.get(metaParams).writes.byEndpoint(); - if (readReplicas.containsKey(endpoint) || writeReplicas.containsKey(endpoint)) - return new Transformation.Rejected(INVALID, "Endpoint is already a member of CMS"); - + CMSMembership cms = prev.cmsMembership; + if (cms.joiningMembers().contains(addition) || cms.fullMembers().contains(addition)) + return new Transformation.Rejected(INVALID, "Endpoint is already a full or joining member of the CMS: " + prev.directory.endpoint(addition)); - ClusterMetadata.Transformer transformer = prev.transformer(); - // Add the candidate as a write replica - DataPlacement.Builder builder = prev.placements.get(metaParams).unbuild() - .withWriteReplica(prev.nextEpoch(), replica); - transformer.with(prev.placements.unbuild().with(metaParams, builder.build()).build()); + // Add the candidate as a joining member + ClusterMetadata.Transformer transformer = prev.transformer().startJoiningCMS(addition); // Construct a set of sources for the new member to stream log tables from (essentially this is the existing members) - Set streamCandidates = new HashSet<>(); - for (Replica r : prev.placements.get(metaParams).reads.byEndpoint().flattenValues()) - { - if (!replica.equals(r)) - streamCandidates.add(r.endpoint()); - } + Set streamCandidates = prev.fullCMSMembers(); // Set up the next step in the sequence. This encapsulates the entire state of the reconfiguration sequence, // including the remaining add/remove operations and the streaming that needs to be done by the joining node + List newAdditions = new ArrayList<>(diff.additions.subList(1, diff.additions.size())); AdvanceCMSReconfiguration next = next(prev.nextEpoch(), newAdditions, diff.removals, new ReconfigureCMS.ActiveTransition(addition, streamCandidates)); + // Create a new sequence instance with the next step to reflect that the state has progressed. ReconfigureCMS advanced = sequence.advance(next); // Finally, replace the existing reconfiguration sequence with this updated one. @@ -206,21 +190,21 @@ private Transformation.Result startAdd(ClusterMetadata prev, ReconfigureCMS sequ */ private Transformation.Result finishAdd(ClusterMetadata prev, ReconfigureCMS sequence, NodeId addition) { + // Check that the candidate is already a joining CMS member + CMSMembership cms = prev.cmsMembership; + if (!cms.joiningMembers().contains(addition)) + return new Transformation.Rejected(INVALID, "Endpoint is not a in the process of joining the CMS: " + prev.directory.endpoint(addition)); + // Add the new member as a full read replica, able to participate in quorums for log updates - ReplicationParams metaParams = ReplicationParams.meta(prev); - InetAddressAndPort endpoint = prev.directory.endpoint(addition); - Replica replica = new Replica(endpoint, entireRange, true); - ClusterMetadata.Transformer transformer = prev.transformer(); - DataPlacement.Builder builder = prev.placements.get(metaParams) - .unbuild() - .withReadReplica(prev.nextEpoch(), replica); - transformer = transformer.with(prev.placements.unbuild().with(metaParams, builder.build()).build()); + ClusterMetadata.Transformer transformer = prev.transformer().finishJoiningCMS(addition); // Set up the next step in the sequence. This encapsulates the entire state of the reconfiguration sequence, // which includes the remaining add/remove operations AdvanceCMSReconfiguration next = next(prev.nextEpoch(), diff.additions, diff.removals, null); + // Create a new sequence instance with the next step to reflect that the state has progressed. ReconfigureCMS advanced = sequence.advance(next); + // Finally, replace the existing reconfiguration sequence with this updated one. transformer.with(prev.inProgressSequences.with(ReconfigureCMS.SequenceKey.instance, (ReconfigureCMS old) -> advanced)); return Transformation.success(transformer, MetaStrategy.affectedRanges(prev)); @@ -238,29 +222,25 @@ private Transformation.Result executeRemove(ClusterMetadata prev, ReconfigureCMS List newRemovals = new ArrayList<>(diff.removals.subList(1, diff.removals.size())); // Check that the candidate is actually a CMS member - ClusterMetadata.Transformer transformer = prev.transformer(); + Set cms = prev.fullCMSMemberIds(); InetAddressAndPort endpoint = prev.directory.endpoint(removal); - Replica replica = new Replica(endpoint, entireRange, true); - ReplicationParams metaParams = ReplicationParams.meta(prev); - if (!prev.fullCMSMembers().contains(endpoint)) + if (!prev.fullCMSMemberIds().contains(removal)) return new Transformation.Rejected(INVALID, String.format("%s is not currently a CMS member, cannot remove it", endpoint)); // Check that the candidate is not the only CMS member - DataPlacement.Builder builder = prev.placements.get(metaParams).unbuild(); - builder.reads.withoutReplica(prev.nextEpoch(), replica); - builder.writes.withoutReplica(prev.nextEpoch(), replica); - DataPlacement proposed = builder.build(); - if (proposed.reads.byEndpoint().isEmpty() || proposed.writes.byEndpoint().isEmpty()) + if (cms.size() == 1) return new Transformation.Rejected(INVALID, String.format("Removing %s will leave no nodes in CMS", endpoint)); - // Actually remove the candidate - transformer = transformer.with(prev.placements.unbuild().with(metaParams, proposed).build()); + // Remove the CMS member + ClusterMetadata.Transformer transformer = prev.transformer().leaveCMS(removal); // Set up the next step in the sequence. This encapsulates the entire state of the reconfiguration sequence, // which includes the remaining add/remove operations AdvanceCMSReconfiguration next = next(prev.nextEpoch(), diff.additions, newRemovals, null); + // Create a new sequence instance with the next step to reflect that the state has progressed. ReconfigureCMS advanced = sequence.advance(next); + // Finally, replace the existing reconfiguration sequence with this updated one. transformer.with(prev.inProgressSequences.with(ReconfigureCMS.SequenceKey.instance, (ReconfigureCMS old) -> advanced)); return Transformation.success(transformer, MetaStrategy.affectedRanges(prev)); diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java index e94292831e4b..0e8c60449137 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java @@ -29,7 +29,6 @@ import java.util.Set; import java.util.function.Function; import java.util.function.Predicate; -import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,7 +81,7 @@ protected Transformation.Result executeInternal(ClusterMetadata prev, Function cms = prev.fullCMSMembers().stream().map(prev.directory::peerId).collect(Collectors.toSet()); + Set cms = prev.fullCMSMemberIds(); Set tmp = new HashSet<>(cms); tmp.addAll(diff.additions); tmp.removeAll(diff.removals); @@ -354,10 +353,7 @@ private static Map extractRf(ReplicationParams params) public static boolean needsReconfiguration(ClusterMetadata metadata) { Map dcRf = extractRf(ReplicationParams.meta(metadata)); - Set currentCms = metadata.fullCMSMembers() - .stream() - .map(metadata.directory::peerId) - .collect(Collectors.toSet()); + Set currentCms = metadata.fullCMSMemberIds(); int expectedSize = dcRf.values().stream().mapToInt(Integer::intValue).sum(); if (currentCms.size() != expectedSize) return true; From 8a68f97f920c5d3a7b9fcef1559a09b257c5f606 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 12 Jun 2025 15:23:30 +0100 Subject: [PATCH 04/23] [CASSANDRA-20736] Update Startup transformation --- .../tcm/transformations/Startup.java | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/transformations/Startup.java b/src/java/org/apache/cassandra/tcm/transformations/Startup.java index a1abfdbff4ec..4476be31901e 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/Startup.java +++ b/src/java/org/apache/cassandra/tcm/transformations/Startup.java @@ -24,10 +24,6 @@ import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.schema.Keyspaces; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Transformation; @@ -35,14 +31,12 @@ import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeVersion; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; -import static org.apache.cassandra.locator.MetaStrategy.entireRange; public class Startup implements Transformation { @@ -81,32 +75,13 @@ public Result execute(ClusterMetadata prev) if (!nodeId.equals(existingNodeId) && addresses.conflictsWith(existingAddresses)) return new Rejected(INVALID, String.format("New addresses %s conflicts with existing node %s with addresses %s", addresses, entry.getKey(), existingAddresses)); } - next = next.withNewAddresses(nodeId, addresses); - Keyspaces allKeyspaces = prev.schema.getKeyspaces().withAddedOrReplaced(prev.schema.getKeyspaces()); - DataPlacements newPlacement = ClusterMetadataService.instance() .placementProvider() .calculatePlacements(prev.nextEpoch(), prev.tokenMap.toRanges(), next.build().metadata, - allKeyspaces); - - if (prev.isCMSMember(prev.directory.endpoint(nodeId))) - { - ReplicationParams metaParams = ReplicationParams.meta(prev); - InetAddressAndPort endpoint = prev.directory.endpoint(nodeId); - Replica leavingReplica = new Replica(endpoint, entireRange, true); - Replica joiningReplica = new Replica(addresses.broadcastAddress, entireRange, true); - - DataPlacement.Builder builder = prev.placements.get(metaParams).unbuild(); - builder.reads.withoutReplica(prev.nextEpoch(), leavingReplica); - builder.writes.withoutReplica(prev.nextEpoch(), leavingReplica); - builder.reads.withReplica(prev.nextEpoch(), joiningReplica); - builder.writes.withReplica(prev.nextEpoch(), joiningReplica); - newPlacement = newPlacement.unbuild().with(metaParams, builder.build()).build(); - } - + prev.schema.getKeyspaces()); next = next.with(newPlacement); } From 307e8781ea6cbee4aa50809bea75fe4479504a44 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 12 Jun 2025 16:11:29 +0100 Subject: [PATCH 05/23] [CASSANDRA-20736] Update legacy CMS membership transformations - TODO remove these --- .../transformations/cms/FinishAddToCMS.java | 10 ++----- .../transformations/cms/RemoveFromCMS.java | 25 ++-------------- .../transformations/cms/StartAddToCMS.java | 29 ++++--------------- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/FinishAddToCMS.java b/src/java/org/apache/cassandra/tcm/transformations/cms/FinishAddToCMS.java index a1bed8ed2ac2..9cef32aef5cd 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/FinishAddToCMS.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/FinishAddToCMS.java @@ -26,7 +26,6 @@ import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.sequences.AddToCMS; import org.apache.cassandra.tcm.sequences.InProgressSequences; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; @@ -84,12 +83,9 @@ public Result execute(ClusterMetadata prev) InetAddressAndPort endpoint = prev.directory.endpoint(targetNode); Replica replica = new Replica(endpoint, entireRange, true); - ClusterMetadata.Transformer transformer = prev.transformer(); - DataPlacement.Builder builder = prev.placements.get(metaParams) - .unbuild() - .withReadReplica(prev.nextEpoch(), replica); - transformer = transformer.with(prev.placements.unbuild().with(metaParams, builder.build()).build()) - .with(prev.inProgressSequences.without(targetNode)); + ClusterMetadata.Transformer transformer = prev.transformer() + .finishJoiningCMS(targetNode) + .with(prev.inProgressSequences.without(targetNode)); return Transformation.success(transformer, MetaStrategy.affectedRanges(prev)); } diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/RemoveFromCMS.java b/src/java/org/apache/cassandra/tcm/transformations/cms/RemoveFromCMS.java index 8e37875e941d..d958309ed8e1 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/RemoveFromCMS.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/RemoveFromCMS.java @@ -29,20 +29,16 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.sequences.InProgressSequences; import org.apache.cassandra.tcm.sequences.ReconfigureCMS; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; -import static org.apache.cassandra.locator.MetaStrategy.entireRange; /** * This class along with AddToCMS, StartAddToCMS & FinishAddToCMS, contain a high degree of duplication with their intended @@ -87,11 +83,7 @@ public Result execute(ClusterMetadata prev) if (sequence != null) return new Transformation.Rejected(INVALID, String.format("Can't remove %s from CMS as there are ongoing range movements on it", endpoint)); - ReplicationParams metaParams = ReplicationParams.meta(prev); - DataPlacement placements = prev.placements.get(metaParams); - - int minProposedSize = (int) Math.min(placements.reads.forRange(replica.range()).get().stream().filter(r -> !r.endpoint().equals(endpoint)).count(), - placements.writes.forRange(replica.range()).get().stream().filter(r -> !r.endpoint().equals(endpoint)).count()); + int minProposedSize = prev.fullCMSMemberIds().size() - 1; if (minProposedSize < MIN_SAFE_CMS_SIZE) { logger.warn("Removing {} from CMS members would reduce the service size to {} which is below the " + @@ -109,19 +101,8 @@ public Result execute(ClusterMetadata prev) if (minProposedSize == 0) return new Transformation.Rejected(INVALID, String.format("Removing %s from the CMS would leave no members in CMS.", endpoint)); - ClusterMetadata.Transformer transformer = prev.transformer(); - Replica replica = new Replica(endpoint, entireRange, true); - - DataPlacement.Builder builder = prev.placements.get(metaParams).unbuild(); - builder.reads.withoutReplica(prev.nextEpoch(), replica); - builder.writes.withoutReplica(prev.nextEpoch(), replica); - DataPlacement proposed = builder.build(); - - if (proposed.reads.byEndpoint().isEmpty() || proposed.writes.byEndpoint().isEmpty()) - return new Transformation.Rejected(INVALID, String.format("Removing %s will leave no nodes in CMS", endpoint)); - - return Transformation.success(transformer.with(prev.placements.unbuild().with(metaParams, proposed).build()), - MetaStrategy.affectedRanges(prev)); + ClusterMetadata.Transformer transformer = prev.transformer().leaveCMS(nodeId); + return Transformation.success(transformer, MetaStrategy.affectedRanges(prev)); } @Override diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/StartAddToCMS.java b/src/java/org/apache/cassandra/tcm/transformations/cms/StartAddToCMS.java index 90f3056f4711..efa92c042a59 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/StartAddToCMS.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/StartAddToCMS.java @@ -18,25 +18,20 @@ package org.apache.cassandra.tcm.transformations.cms; -import java.util.HashSet; import java.util.Set; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.RangesByEndpoint; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.sequences.AddToCMS; import org.apache.cassandra.tcm.sequences.ReconfigureCMS; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; -import static org.apache.cassandra.locator.MetaStrategy.entireRange; /** * This class along with AddToCMS, FinishAddToCMS & RemoveFromCMS, contain a high degree of duplication with their intended @@ -76,27 +71,13 @@ public Result execute(ClusterMetadata prev) if (prev.inProgressSequences.get(ReconfigureCMS.SequenceKey.instance) != null) return new Rejected(INVALID, String.format("Cannot add node to CMS as a CMS reconfiguration is currently active")); - Replica replica = new Replica(endpoint, entireRange, true); - ReplicationParams metaParams = ReplicationParams.meta(prev); - RangesByEndpoint readReplicas = prev.placements.get(metaParams).reads.byEndpoint(); - RangesByEndpoint writeReplicas = prev.placements.get(metaParams).writes.byEndpoint(); - - if (readReplicas.containsKey(endpoint) || writeReplicas.containsKey(endpoint)) + CMSMembership cms = prev.cmsMembership; + if (cms.joiningMembers().contains(nodeId) || cms.fullMembers().contains(nodeId)) return new Transformation.Rejected(INVALID, "Endpoint is already a member of CMS"); - ClusterMetadata.Transformer transformer = prev.transformer(); - DataPlacement.Builder builder = prev.placements.get(metaParams).unbuild() - .withWriteReplica(prev.nextEpoch(), replica); - - transformer.with(prev.placements.unbuild().with(metaParams, builder.build()).build()); - - Set streamCandidates = new HashSet<>(); - for (Replica r : prev.placements.get(metaParams).reads.byEndpoint().flattenValues()) - { - if (!replica.equals(r)) - streamCandidates.add(r.endpoint()); - } + ClusterMetadata.Transformer transformer = prev.transformer().startJoiningCMS(nodeId); + Set streamCandidates = prev.fullCMSMembers(); AddToCMS joinSequence = new AddToCMS(prev.nextEpoch(), nodeId, streamCandidates, new FinishAddToCMS(endpoint)); transformer = transformer.with(prev.inProgressSequences.with(nodeId, joinSequence)); return Transformation.success(transformer, MetaStrategy.affectedRanges(prev)); From 5db6f87f492477d6236a4dbce45783b256121da8 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Fri, 13 Jun 2025 11:11:05 +0100 Subject: [PATCH 06/23] [CASSANDRA-20736] New MetadataKey for CMS membership --- .../apache/cassandra/tcm/ClusterMetadata.java | 13 ++++-- .../apache/cassandra/tcm/MetadataKeys.java | 2 + .../tcm/StubClusterMetadataService.java | 3 +- .../tcm/compatibility/GossipHelper.java | 10 +++-- .../test/log/ClusterMetadataTestHelper.java | 4 +- .../cassandra/locator/MetaStrategyTest.java | 4 +- .../ClusterMetadataTransformationTest.java | 41 +++++++++++++++++++ .../tcm/membership/MembershipUtils.java | 6 +-- .../cassandra/utils/CassandraGenerators.java | 4 +- 9 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index c7e4660f4e6e..4fa279ab6ca4 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -60,6 +60,7 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.service.accord.AccordStaleReplicas; @@ -254,7 +255,7 @@ public EndpointsForRange fullCMSMembersAsReplicas() private DataPlacements addMetaPlacement(DataPlacements placements, CMSMembership cms) { - if (epoch.isBefore(Epoch.FIRST)) + if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) return placements; DataPlacement.Builder metaBuilder = DataPlacement.builder(); @@ -826,11 +827,16 @@ public Transformed build() consensusMigrationState = consensusMigrationState.withLastModified(epoch); } - if (consensusMigrationState != base.consensusMigrationState || schema != base.schema) - { + if (consensusMigrationState != base.consensusMigrationState || schema != base.schema) { consensusMigrationState.validateAgainstSchema(schema); } + if (cmsMembership != base.cmsMembership) + { + modifiedKeys.add(MetadataKeys.CMS_MEMBERSHIP); + cmsMembership = cmsMembership.withLastModified(epoch); + } + return new Transformed(new ClusterMetadata(base.metadataIdentifier, epoch, partitioner, @@ -983,7 +989,6 @@ public String toString() ", extensions=" + extensions + ", accordStaleReplicas=" + accordStaleReplicas + ", cmsMembership=" + cmsMembership + - ", cmsPlacement=" + cmsDataPlacement + '}'; } diff --git a/src/java/org/apache/cassandra/tcm/MetadataKeys.java b/src/java/org/apache/cassandra/tcm/MetadataKeys.java index 0be621b20189..8188d03e14cd 100644 --- a/src/java/org/apache/cassandra/tcm/MetadataKeys.java +++ b/src/java/org/apache/cassandra/tcm/MetadataKeys.java @@ -45,6 +45,7 @@ public class MetadataKeys public static final MetadataKey LOCKED_RANGES = make(CORE_NS, "sequences", "locked_ranges"); public static final MetadataKey IN_PROGRESS_SEQUENCES = make(CORE_NS, "sequences", "in_progress"); public static final MetadataKey CONSENSUS_MIGRATION_STATE = make(CORE_NS, "consensus", "migration_state"); + public static final MetadataKey CMS_MEMBERSHIP = make(CORE_NS, "cms_membership", "cms_membership"); public static final ImmutableMap>> CORE_METADATA = ImmutableMap.>>builder() @@ -57,6 +58,7 @@ public class MetadataKeys .put(ACCORD_FAST_PATH, cm -> cm.accordFastPath) .put(ACCORD_STALE_REPLICAS, cm -> cm.accordStaleReplicas) .put(CONSENSUS_MIGRATION_STATE, cm -> cm.consensusMigrationState) + .put(CMS_MEMBERSHIP, cm -> cm.cmsMembership) .build(); public static MetadataKey make(String...parts) diff --git a/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java index 6036554e0d00..01bfbf85f5d3 100644 --- a/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java @@ -184,7 +184,8 @@ public StubClusterMetadataService build() InProgressSequences.EMPTY, ConsensusMigrationState.EMPTY, ImmutableMap.of(), - AccordStaleReplicas.EMPTY); + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); } return new StubClusterMetadataService(new UniformRangePlacement(), snapshots != null ? snapshots : MetadataSnapshots.NO_OP, diff --git a/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java b/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java index 1f451cdbc2ce..dc3fae0fb9b9 100644 --- a/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java +++ b/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java @@ -58,6 +58,7 @@ import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.service.accord.AccordStaleReplicas; import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MultiStepOperation; @@ -303,7 +304,8 @@ public static ClusterMetadata emptyWithSchemaFromSystemTables(Set allKno InProgressSequences.EMPTY, ConsensusMigrationState.EMPTY, Collections.emptyMap(), - AccordStaleReplicas.EMPTY); + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); } public static ClusterMetadata fromEndpointStates(DistributedSchema schema, Map epStates) @@ -393,7 +395,8 @@ public static ClusterMetadata fromEndpointStates(Map epstates) diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java index 063f94857cfc..a24b7a32b362 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java @@ -63,6 +63,7 @@ import org.apache.cassandra.service.accord.AccordStaleReplicas; import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.AtomicLongBackedProcessor; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Commit; @@ -168,7 +169,8 @@ public static ClusterMetadata minimalForTesting(Epoch epoch, IPartitioner partit InProgressSequences.EMPTY, ConsensusMigrationState.EMPTY, ImmutableMap.of(), - AccordStaleReplicas.EMPTY); + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); } public static ClusterMetadata minimalForTesting(IPartitioner partitioner) diff --git a/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java b/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java index 36cc9d154702..be3252c166ab 100644 --- a/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java +++ b/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java @@ -36,6 +36,7 @@ import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.service.accord.AccordStaleReplicas; import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.membership.Directory; @@ -96,7 +97,8 @@ public static ClusterMetadata metadata(NodeConfiguration... configurations) InProgressSequences.EMPTY, ConsensusMigrationState.EMPTY, ImmutableMap.of(), - AccordStaleReplicas.EMPTY); + AccordStaleReplicas.EMPTY, + CMSMembership.EMPTY); } @Test diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java index 528dbae47737..730ee6cb6d41 100644 --- a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Collections; +import java.util.List; import java.util.Random; import java.util.concurrent.ThreadLocalRandom; @@ -33,6 +34,7 @@ import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata.Transformer.Transformed; @@ -55,6 +57,7 @@ import static org.apache.cassandra.tcm.MetadataKeys.ACCORD_FAST_PATH; import static org.apache.cassandra.tcm.MetadataKeys.ACCORD_STALE_REPLICAS; +import static org.apache.cassandra.tcm.MetadataKeys.CMS_MEMBERSHIP; import static org.apache.cassandra.tcm.MetadataKeys.CONSENSUS_MIGRATION_STATE; import static org.apache.cassandra.tcm.MetadataKeys.DATA_PLACEMENTS; import static org.apache.cassandra.tcm.MetadataKeys.IN_PROGRESS_SEQUENCES; @@ -137,6 +140,42 @@ public void testModifyMembershipAndOwnership() assertModifications(transformed, NODE_DIRECTORY, TOKEN_MAP); } + @Test + public void testModifyCMSMembership() + { + ClusterMetadata metadata = new ClusterMetadata(Murmur3Partitioner.instance, Directory.EMPTY, DistributedSchema.empty()); + Transformed transformed = metadata.transformer().build(); + assertTrue(transformed.modifiedKeys.isEmpty()); + + List endpoints = MembershipUtils.uniqueEndpoints(random, 2); + NodeAddresses a1 = new NodeAddresses(endpoints.get(0)); + NodeAddresses a2 = new NodeAddresses(endpoints.get(1)); + + transformed = metadata.transformer() + .register(a1, new Location("dc1", "rack1"), NodeVersion.CURRENT) + .register(a2, new Location("dc1", "rack1"), NodeVersion.CURRENT) + .build(); + assertModifications(transformed, NODE_DIRECTORY); + NodeId n1 = transformed.metadata.directory.peerId(a1.broadcastAddress); + NodeId n2 = transformed.metadata.directory.peerId(a2.broadcastAddress); + + transformed = metadata.transformer().startJoiningCMS(n1).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + transformed = transformed.metadata.transformer().cancelJoiningCMS(n1).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + + transformed = transformed.metadata.transformer().startJoiningCMS(n1).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + transformed = transformed.metadata.transformer().finishJoiningCMS(n1).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + + transformed = transformed.metadata.transformer().startJoiningCMS(n2).finishJoiningCMS(n2).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + + transformed = transformed.metadata.transformer().leaveCMS(n1).build(); + assertModifications(transformed, CMS_MEMBERSHIP); + } + @Test public void testModifySchema() { @@ -320,6 +359,8 @@ else if (key == CONSENSUS_MIGRATION_STATE) return metadata.consensusMigrationState; else if (key == ACCORD_STALE_REPLICAS) return metadata.accordStaleReplicas; + else if (key == CMS_MEMBERSHIP) + return metadata.cmsMembership; throw new IllegalArgumentException("Unknown metadata key " + key); } diff --git a/test/unit/org/apache/cassandra/tcm/membership/MembershipUtils.java b/test/unit/org/apache/cassandra/tcm/membership/MembershipUtils.java index 91e1f1e1c3ce..3820c8f8d4d9 100644 --- a/test/unit/org/apache/cassandra/tcm/membership/MembershipUtils.java +++ b/test/unit/org/apache/cassandra/tcm/membership/MembershipUtils.java @@ -19,8 +19,8 @@ package org.apache.cassandra.tcm.membership; import java.net.UnknownHostException; +import java.util.List; import java.util.Random; -import java.util.Set; import java.util.stream.Collectors; import org.apache.cassandra.locator.InetAddressAndPort; @@ -38,13 +38,13 @@ public static InetAddressAndPort randomEndpoint(Random random) return endpoint(random.nextInt(254) + 1); } - public static Set uniqueEndpoints(Random random, int count) + public static List uniqueEndpoints(Random random, int count) { return random.ints(1, 255) .distinct() .limit(count) .mapToObj(MembershipUtils::endpoint) - .collect(Collectors.toSet()); + .collect(Collectors.toList()); } public static InetAddressAndPort endpoint(int i) diff --git a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java index 36ce78e21a73..05c85e79ac75 100644 --- a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java +++ b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java @@ -139,6 +139,7 @@ import org.apache.cassandra.service.accord.fastpath.SimpleFastPathStrategy; import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.extensions.ExtensionKey; @@ -1937,7 +1938,8 @@ public Gen build() ConsensusMigrationState consensusMigrationState = ConsensusMigrationState.EMPTY; Map, ExtensionValue> extensions = ImmutableMap.of(); AccordStaleReplicas accordStaleReplicas = accordStaleReplicasGen.generate(rnd); - return new ClusterMetadata(epoch, partitioner, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, extensions, accordStaleReplicas); + CMSMembership cmsMembership = CMSMembership.EMPTY; + return new ClusterMetadata(epoch, partitioner, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, extensions, accordStaleReplicas, cmsMembership); }; } } From 5a9ccbffb835941bae27076c767520767c2f4b87 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Fri, 13 Jun 2025 16:14:09 +0100 Subject: [PATCH 07/23] [CASSANDRA-20736] Support for upgrades from gossip & from earlier versions with TCM --- .../apache/cassandra/tcm/CMSMembership.java | 36 +++++++++++++++++++ .../apache/cassandra/tcm/ClusterMetadata.java | 33 +++++++++++++++-- .../tcm/ownership/DataPlacements.java | 16 +++++++-- 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/CMSMembership.java b/src/java/org/apache/cassandra/tcm/CMSMembership.java index d83a5b950e90..d18bfdb66310 100644 --- a/src/java/org/apache/cassandra/tcm/CMSMembership.java +++ b/src/java/org/apache/cassandra/tcm/CMSMembership.java @@ -28,7 +28,10 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.ownership.VersionedEndpoints; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; import org.apache.cassandra.utils.btree.BTreeSet; @@ -42,6 +45,39 @@ public class CMSMembership implements MetadataValue private final BTreeSet fullMembers; private final BTreeSet joiningMembers; + /** + * Used to derive a CMSMembership when deserializing a ClusterMetadata instance written with a metadata version + * prior to V7. At that time, CMS membership was always inferred from the data placements of the distributed + * cluster metadata keyspace. Read replicas are full members of the CMS and write-only replicas are in the process + * of joining. Note: every read replica must also be a write replica, leaving the CMS is atomic in respect of the + * placements. + * @param placement + * @param directory + * @return + */ + public static CMSMembership reconstruct(DataPlacement placement, Directory directory) + { + SortedSet full = new TreeSet<>(); + SortedSet joining = new TreeSet<>(); + Epoch lm = Epoch.EMPTY; + for (VersionedEndpoints.ForRange endpoints : placement.reads.endpoints) + { + lm = endpoints.lastModified().isAfter(lm) ? endpoints.lastModified() : lm; + endpoints.get().endpoints().forEach(e -> full.add(directory.peerId(e))); + } + + for (VersionedEndpoints.ForRange endpoints : placement.writes.endpoints) + { + lm = endpoints.lastModified().isAfter(lm) ? endpoints.lastModified() : lm; + endpoints.get().endpoints().forEach(e -> { + NodeId id = directory.peerId(e); + if (!full.contains(id)) + joining.add(id); + }); + } + return new CMSMembership(lm, BTreeSet.of(full), BTreeSet.of(joining)); + } + private CMSMembership() { this(Epoch.EMPTY, diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 4fa279ab6ca4..73367ee8f80c 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -209,7 +209,7 @@ private ClusterMetadata(int metadataIdentifier, this.directory = directory; this.tokenMap = tokenMap; this.accordFastPath = accordFastPath; - this.placements = addMetaPlacement(placements, cmsMembership); + this.placements = maybeAddMetaPlacement(placements, cmsMembership); this.lockedRanges = lockedRanges; this.inProgressSequences = inProgressSequences; this.consensusMigrationState = consensusMigrationState; @@ -253,7 +253,7 @@ public EndpointsForRange fullCMSMembersAsReplicas() return fullCMSReplicas; } - private DataPlacements addMetaPlacement(DataPlacements placements, CMSMembership cms) + private DataPlacements maybeAddMetaPlacement(DataPlacements placements, CMSMembership cms) { if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) return placements; @@ -261,9 +261,24 @@ private DataPlacements addMetaPlacement(DataPlacements placements, CMSMembership DataPlacement.Builder metaBuilder = DataPlacement.builder(); if (epoch.is(Epoch.FIRST)) { + // PRE_INITIALIZE_CMS: placements need to be hardcoded to the local address so that the subsequent + // INITIALIZE_CMS can be committed metaBuilder.withReadReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); metaBuilder.withWriteReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); } + else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) + { + // This cluster did not previously upgrade from a gossip based version (i.e. pre-6.0) but did at some point + // run a version prior to MetadataVersion.V7 where we started to encode CMS membership directly. This + // condition implies that we are reconstructing a serialized cluster metadata during replay or else the + // directory should not be empty after Epoch.FIRST as the base state in INITIALIZE_CMS now includes the + // first CMS node. Similarly, if the cluster had previously been running a gossip-based version, the + // directory would contain entries for each of the live nodes at the time of upgrade. + // Given this state, the very next transformation that is/was applied will be to register the node that + // committed the PRE_INITIALIZE_CMS and INTIALIZE_CMS transformations. So we just leave the placements + // untouched as they will already contain that node as an endpoint. + return placements; + } else { for (NodeId id : cms.fullMembers()) @@ -1218,6 +1233,20 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE CMSMembership cmsMembership = CMSMembership.EMPTY; if (version.isAtLeast(Version.V8)) cmsMembership = CMSMembership.serializer.deserialize(in, version); + else + { + KeyspaceMetadata metadataKs = schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); + if (!dir.isEmpty()) + { + // Pre-V7 the membership of the CMS was always inferred from the placement of the distributed + // metadata keyspace. This is true for the initial cluster metadata created during upgrade from + // gossip and for subsequent epochs. The endpoints in the placement must belong to registered nodes, + // so we can derive the CMSMembership using the data placement and directory. + DataPlacement placement = placements.get(metadataKs.params.replication); + cmsMembership = CMSMembership.reconstruct(placement, dir); + } + placements = placements.unbuild().without(metadataKs.params.replication).build(); + } return new ClusterMetadata(clusterIdentifier, epoch, diff --git a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java index a4dc95eef47c..a21d809298b6 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java +++ b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java @@ -250,9 +250,14 @@ public static class Serializer implements MetadataSerializer public void serialize(DataPlacements t, DataOutputPlus out, Version version) throws IOException { Map map = t.asMap(); - out.writeInt(map.size()); + + // From V7, placements for the metadata keyspace are derived from CMSMembership, not serialized + int mapSize = version.isBefore(Version.V7) ? map.size() : Math.max(map.size() - 1, 0); + out.writeInt(mapSize); for (Map.Entry entry : map.entrySet()) { + if (version.isAtLeast(Version.V7) && entry.getKey().isMeta()) + continue; ReplicationParams.serializer.serialize(entry.getKey(), out, version); DataPlacement.serializerFor(entry.getKey()).serialize(entry.getValue(), out, version); } @@ -274,9 +279,14 @@ public DataPlacements deserialize(DataInputPlus in, Version version) throws IOEx public long serializedSize(DataPlacements t, Version version) { - long size = sizeof(t.size()); - for (Map.Entry entry : t.asMap().entrySet()) + Map map = t.asMap(); + // From V7, placements for the metadata keyspace are derived from CMSMembership, not serialized + int mapSize = version.isBefore(Version.V7) ? map.size() : Math.max(map.size() - 1, 0); + long size = sizeof(mapSize); + for (Map.Entry entry : map.entrySet()) { + if (version.isAtLeast(Version.V7) && entry.getKey().isMeta()) + continue; size += ReplicationParams.serializer.serializedSize(entry.getKey(), version); size += DataPlacement.serializerFor(entry.getKey()).serializedSize(entry.getValue(), version); } From 4c012b13107a16db73381f21d842dc28b1bee51c Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Fri, 13 Jun 2025 18:41:46 +0100 Subject: [PATCH 08/23] [CASSANDRA-20736] Update CancelCMSReconfiguration --- .../apache/cassandra/tcm/CMSMembership.java | 18 +++++ .../apache/cassandra/tcm/ClusterMetadata.java | 24 +++---- .../sequences/CancelCMSReconfiguration.java | 66 ++++++++----------- 3 files changed, 53 insertions(+), 55 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/CMSMembership.java b/src/java/org/apache/cassandra/tcm/CMSMembership.java index d18bfdb66310..5b88543be8af 100644 --- a/src/java/org/apache/cassandra/tcm/CMSMembership.java +++ b/src/java/org/apache/cassandra/tcm/CMSMembership.java @@ -28,6 +28,8 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.locator.Replica; import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.ownership.DataPlacement; @@ -78,6 +80,22 @@ public static CMSMembership reconstruct(DataPlacement placement, Directory direc return new CMSMembership(lm, BTreeSet.of(full), BTreeSet.of(joining)); } + public DataPlacement toPlacement(Directory directory) + { + DataPlacement.Builder builder = DataPlacement.builder(); + for (NodeId id : fullMembers) + { + Replica replica = MetaStrategy.replica(directory.endpoint(id)); + builder.withReadReplica(lastModified, replica); + builder.withWriteReplica(lastModified, replica); + } + for(NodeId id : joiningMembers) + { + builder.withWriteReplica(lastModified, MetaStrategy.replica(directory.endpoint(id))); + } + return builder.build(); + } + private CMSMembership() { this(Epoch.EMPTY, diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 73367ee8f80c..360f5fc014d8 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -258,13 +258,16 @@ private DataPlacements maybeAddMetaPlacement(DataPlacements placements, CMSMembe if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) return placements; - DataPlacement.Builder metaBuilder = DataPlacement.builder(); + DataPlacement metaPlacement; if (epoch.is(Epoch.FIRST)) { // PRE_INITIALIZE_CMS: placements need to be hardcoded to the local address so that the subsequent // INITIALIZE_CMS can be committed - metaBuilder.withReadReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); - metaBuilder.withWriteReplica(Epoch.FIRST, MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort())); + Replica localReplica = MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort()); + metaPlacement = DataPlacement.builder() + .withReadReplica(Epoch.FIRST, localReplica) + .withWriteReplica(Epoch.FIRST, localReplica) + .build(); } else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) { @@ -281,20 +284,11 @@ else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) } else { - for (NodeId id : cms.fullMembers()) - { - Replica replica = MetaStrategy.replica(directory.endpoint(id)); - metaBuilder.withReadReplica(cms.lastModified(), replica); - metaBuilder.withWriteReplica(cms.lastModified(), replica); - } - - for(NodeId id : cms.joiningMembers()) - { - metaBuilder.withWriteReplica(cms.lastModified(), MetaStrategy.replica(directory.endpoint(id))); - } + // Build a placement based on the CMS membership + metaPlacement = cms.toPlacement(directory); } return placements.unbuild() - .with(ReplicationParams.meta(this), metaBuilder.build()) + .with(ReplicationParams.meta(this), metaPlacement) .build(); } diff --git a/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java index 5ce36e09a814..2c520e44ebd8 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java @@ -25,25 +25,20 @@ import org.apache.cassandra.exceptions.ExceptionCode; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.tcm.CMSMembership; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.Directory; -import org.apache.cassandra.tcm.ownership.DataPlacement; -import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import static org.apache.cassandra.locator.MetaStrategy.entireRange; - public class CancelCMSReconfiguration implements Transformation { public static final Serializer serializer = new Serializer(); @@ -66,53 +61,44 @@ public Result execute(ClusterMetadata prev) if (reconfigureCMS == null) return new Rejected(ExceptionCode.INVALID, "Can not cancel reconfiguration since there does not seem to be any in-flight"); - ReplicationParams metaParams = ReplicationParams.meta(prev); ClusterMetadata.Transformer transformer = prev.transformer(); - DataPlacement placement = prev.placements.get(metaParams); // Reset any partially completed transition by removing the pending replica from the write group if (reconfigureCMS.next.activeTransition != null) { - InetAddressAndPort pendingEndpoint = prev.directory.endpoint(reconfigureCMS.next.activeTransition.nodeId); - Replica pendingReplica = new Replica(pendingEndpoint, entireRange, true); - placement = placement.unbuild() - .withoutWriteReplica(prev.nextEpoch(), pendingReplica) - .build(); + // see what the placements for the meta keyspace will be after cancelling the active join + CMSMembership cms = prev.cmsMembership.cancelJoining(reconfigureCMS.next.activeTransition.nodeId); + if (!cms.joiningMembers().isEmpty()) + return new Rejected(ExceptionCode.INVALID, + String.format("Placements will be inconsistent if this transformation is applied:" + + "\nFull members %s\nJoining members: %s", + cms.fullMembers(), + cms.joiningMembers())); + + // if all is good, actually cancel the joining member + transformer = transformer.cancelJoiningCMS(reconfigureCMS.next.activeTransition.nodeId); + // Recalculate the replication params for the meta keyspace based on the actual placement as it will be + ReplicationParams recalculated = getAccurateReplication(prev.directory, cms); + + // If they no longer match the replication params in schema, i.e. the transitions completed so far did not + // bring the membership/placements into line with configuration, update schema to match what we actually have + if (!recalculated.equals(ReplicationParams.meta(prev))) + { + KeyspaceMetadata keyspace = prev.schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); + KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, recalculated, FastPathStrategy.simple())); + transformer = transformer.with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(newKeyspace))); + } } - if (!placement.reads.equivalentTo(placement.writes)) - return new Rejected(ExceptionCode.INVALID, String.format("Placements will be inconsistent if this transformation is applied:\nReads %s\nWrites: %s", - placement.reads, - placement.writes)); - - // Reset the replication params for the meta keyspace based on the actual placement in case they no longer match - ReplicationParams fromPlacement = getAccurateReplication(prev.directory, placement); - - // If they no longer match, i.e. the transitions completed so far did not bring the placements into line with - // the configuration, remove the entry keyed by the existing configured params. - DataPlacements.Builder builder = prev.placements.unbuild(); - if (!metaParams.equals(fromPlacement)) - { - builder = builder.without(metaParams); - - // Also update schema with the corrected params - KeyspaceMetadata keyspace = prev.schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); - KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, fromPlacement, FastPathStrategy.simple())); - transformer = transformer.with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(newKeyspace))); - } - - // finally, add the possibly corrected placement keyed by the possibly corrected params - builder = builder.with(fromPlacement, placement); - transformer = transformer.with(builder.build()); return Transformation.success(transformer.with(prev.inProgressSequences.without(ReconfigureCMS.SequenceKey.instance)) .with(prev.lockedRanges.unlock(reconfigureCMS.next.lockKey)), MetaStrategy.affectedRanges(prev)); } - private ReplicationParams getAccurateReplication(Directory directory, DataPlacement placement) + private ReplicationParams getAccurateReplication(Directory directory, CMSMembership membership) { Map replicasPerDc = new HashMap<>(); - placement.writes.byEndpoint().keySet().forEach(i -> { - String dc = directory.location(directory.peerId(i)).datacenter; + membership.fullMembers().forEach(id -> { + String dc = directory.location(id).datacenter; int count = replicasPerDc.getOrDefault(dc, 0); replicasPerDc.put(dc, ++count); }); From c9faf3f7fc4639a92193f5e60870097dc7be112f Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 16 Jun 2025 19:17:59 +0100 Subject: [PATCH 09/23] [CASSANDRA-20736] Properly set lastModified on DataPlacements --- .../org/apache/cassandra/tcm/ClusterMetadata.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 360f5fc014d8..8ad54b6d7d80 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -259,6 +259,8 @@ private DataPlacements maybeAddMetaPlacement(DataPlacements placements, CMSMembe return placements; DataPlacement metaPlacement; + Epoch previousLastModified = placements.lastModified(); + Epoch nextLastModified; if (epoch.is(Epoch.FIRST)) { // PRE_INITIALIZE_CMS: placements need to be hardcoded to the local address so that the subsequent @@ -268,6 +270,7 @@ private DataPlacements maybeAddMetaPlacement(DataPlacements placements, CMSMembe .withReadReplica(Epoch.FIRST, localReplica) .withWriteReplica(Epoch.FIRST, localReplica) .build(); + nextLastModified = Epoch.FIRST; } else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) { @@ -286,10 +289,16 @@ else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) { // Build a placement based on the CMS membership metaPlacement = cms.toPlacement(directory); + if (cms.lastModified().isAfter(previousLastModified)) + nextLastModified = cms.lastModified(); + else + nextLastModified = previousLastModified; + } return placements.unbuild() .with(ReplicationParams.meta(this), metaPlacement) - .build(); + .build() + .withLastModified(nextLastModified); } public Transformer transformer() From 12a2f55c9202b0e8020105dc3ca847572bdb7a2e Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 16 Jun 2025 15:28:20 +0100 Subject: [PATCH 10/23] [CASSANDRA-20736] AtomicLongProcessor can always accept commits --- .../apache/cassandra/tcm/AbstractLocalProcessor.java | 4 +++- .../cassandra/tcm/AtomicLongBackedProcessor.java | 12 ++++++++++++ .../apache/cassandra/tcm/PaxosBackedProcessor.java | 10 ++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java index 40416a782157..8dde699d37b3 100644 --- a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java +++ b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java @@ -43,6 +43,7 @@ public AbstractLocalProcessor(LocalLog log) this.log = log; } + /** * Epoch returned by processor in the Result is _not_ guaranteed to be visible by the Follower by * the time when this method returns. @@ -53,7 +54,7 @@ public final Commit.Result commit(Entry.Id entryId, Transformation transform, fi while (!retryPolicy.hasExpired()) { ClusterMetadata previous = log.waitForHighestConsecutive(); - if (!previous.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort()) && previous.epoch.isAfter(Epoch.FIRST)) + if (!acceptCommit(previous)) { String msg = String.format("Node %s is not a CMS member in epoch %s; members=%s", FBUtilities.getBroadcastAddressAndPort(), @@ -196,4 +197,5 @@ private LogState toLogState(Epoch lastKnown) public abstract ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy); protected abstract boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch previousEpoch, Epoch nextEpoch); + protected abstract boolean acceptCommit(ClusterMetadata metadata); } diff --git a/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java b/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java index bbbe4fa80621..fd0c92c18115 100644 --- a/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java @@ -59,6 +59,18 @@ public AtomicLongBackedProcessor(LocalLog log, boolean isReset) this.epochHolder = new AtomicLong(epoch.getEpoch()); } + @Override + public boolean acceptCommit(ClusterMetadata metadata) + { + // AtomicLongBackedProcessor is only for use in tests and offline tools and it should be safe to assume that it + // is always allowed to process commit requests. In a non-test setup, when the CMS is initialized the initiating + // node is also registered. This is not the case in tests (see initCMS/recreateCMS in ServerTestUtils for more + // detail), as this simplifies setting up/tearing down topologies in test cases. The processor intended for use + // in real clusters (PaxosBackedProcessor) performs some actual validation here to ensure that the node is + // really a CMS member. + return true; + } + @Override protected boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch previousEpoch, Epoch nextEpoch) { diff --git a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java index 03f9afe0537c..b45887bbc50b 100644 --- a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java @@ -43,6 +43,7 @@ import org.apache.cassandra.net.RequestCallbackWithFailure; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; +import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.log.LogState; @@ -63,6 +64,15 @@ public PaxosBackedProcessor(LocalLog log) super(log); } + @Override + protected boolean acceptCommit(ClusterMetadata metadata) + { + if (metadata.epoch.isAfter(Epoch.FIRST) && metadata.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())) + return true; + return metadata.epoch.isEqualOrBefore(Epoch.FIRST) + && metadata.placements.get(ReplicationParams.meta(metadata)).reads.byEndpoint().keySet().contains(FBUtilities.getBroadcastAddressAndPort()); + } + @Override protected boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch previousEpoch, Epoch nextEpoch) { From 36d62e7743cb04dab93043449a91768a5f06faa2 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Wed, 18 Jun 2025 15:41:07 +0100 Subject: [PATCH 11/23] [CASSANDRA-20736] Separate MetaStrategy placements from others --- .../cassandra/locator/EndpointsForToken.java | 8 +- .../cassandra/locator/ReplicaLayout.java | 24 ++-- .../apache/cassandra/tcm/ClusterMetadata.java | 121 +++++++++++------- .../cassandra/tcm/PaxosBackedProcessor.java | 2 +- .../tcm/ownership/DataPlacement.java | 5 + .../tcm/ownership/DataPlacements.java | 18 +-- .../tcm/ownership/UniformRangePlacement.java | 6 +- .../cassandra/tcm/serialization/Version.java | 1 + 8 files changed, 109 insertions(+), 76 deletions(-) diff --git a/src/java/org/apache/cassandra/locator/EndpointsForToken.java b/src/java/org/apache/cassandra/locator/EndpointsForToken.java index fc88fa761bba..f0a3f732bafa 100644 --- a/src/java/org/apache/cassandra/locator/EndpointsForToken.java +++ b/src/java/org/apache/cassandra/locator/EndpointsForToken.java @@ -25,7 +25,9 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; /** @@ -164,7 +166,11 @@ public static EndpointsForToken copyOf(Token token, Iterable replicas) public static VersionedEndpoints.ForToken natural(Keyspace keyspace, Token token) { - return ClusterMetadata.current().placements.get(keyspace.getMetadata().params.replication).reads.forToken(token); + ReplicationParams replication = keyspace.getMetadata().params.replication; + DataPlacement placement = replication.isMeta() + ? ClusterMetadata.current().getCMSPlacement() + : ClusterMetadata.current().placements.get(replication); + return placement.reads.forToken(token); } } diff --git a/src/java/org/apache/cassandra/locator/ReplicaLayout.java b/src/java/org/apache/cassandra/locator/ReplicaLayout.java index 440b2bed7310..71a4ff4beb4e 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaLayout.java +++ b/src/java/org/apache/cassandra/locator/ReplicaLayout.java @@ -239,7 +239,9 @@ public static ReplicaLayout.ForTokenWrite forTokenWriteLiveAndDown(ClusterMetada { // todo deduplicate so that "pending" contains "read - write", // which is a hack until we revisit how consistency level handles pending - DataPlacement dataPlacement = metadata.placements.get(ks.params.replication); + DataPlacement dataPlacement = ks.params.replication.isMeta() + ? metadata.getCMSPlacement() + : metadata.placements.get(ks.params.replication); natural = forNonLocalStrategyTokenRead(dataPlacement, token); // perf optimization to avoid double endpoints search and filtering for a typical case // DataPlacement constructor does a deduplication of reads/writes, so we can use cheap == comparision here @@ -394,30 +396,22 @@ static ReplicaLayout.ForRangeRead forRangeReadSorted(ClusterMetadata metadata, K static EndpointsForRange forNonLocalStategyRangeRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, AbstractBounds range) { - return metadata.placements.get(keyspace.params.replication).reads.forRange(range.right.getToken()).get(); + DataPlacement placement = keyspace.params.replication.isMeta() + ? metadata.getCMSPlacement() + : metadata.placements.get(keyspace.params.replication); + return placement.reads.forRange(range.right.getToken()).get(); } - public static EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, Token token) - { - return forNonLocalStrategyTokenRead(metadata.placements.get(keyspace.params.replication), token); - } - - public static EndpointsForToken forNonLocalStrategyTokenRead(DataPlacement dataPlacement, Token token) + private static EndpointsForToken forNonLocalStrategyTokenRead(DataPlacement dataPlacement, Token token) { return dataPlacement.reads.forToken(token).get(); } - static EndpointsForToken forNonLocalStrategyTokenWrite(ClusterMetadata metadata, KeyspaceMetadata keyspace, Token token) - { - return forNonLocalStrategyTokenWrite(metadata.placements.get(keyspace.params.replication), token); - } - - static EndpointsForToken forNonLocalStrategyTokenWrite(DataPlacement dataPlacement, Token token) + private static EndpointsForToken forNonLocalStrategyTokenWrite(DataPlacement dataPlacement, Token token) { return dataPlacement.writes.forToken(token).get(); } - static EndpointsForRange forLocalStrategyRange(ClusterMetadata metadata, AbstractReplicationStrategy replicationStrategy, AbstractBounds range) { return replicationStrategy.calculateNaturalReplicas(range.right.getToken(), metadata); diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 8ad54b6d7d80..56b71360a6bd 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -120,6 +120,7 @@ public class ClusterMetadata // These fields are lazy but only for the test purposes, since their computation requires initialization of the log ks private EndpointsForRange fullCMSReplicas; private Set fullCMSEndpoints; + private DataPlacement cmsDataPlacement; private DataPlacements writePlacementAllSettled; public ClusterMetadata(IPartitioner partitioner) @@ -209,7 +210,7 @@ private ClusterMetadata(int metadataIdentifier, this.directory = directory; this.tokenMap = tokenMap; this.accordFastPath = accordFastPath; - this.placements = maybeAddMetaPlacement(placements, cmsMembership); + this.placements = placements; this.lockedRanges = lockedRanges; this.inProgressSequences = inProgressSequences; this.consensusMigrationState = consensusMigrationState; @@ -217,6 +218,7 @@ private ClusterMetadata(int metadataIdentifier, this.locator = Locator.usingDirectory(directory); this.accordStaleReplicas = accordStaleReplicas; this.cmsMembership = cmsMembership; + this.cmsDataPlacement = calculateCMSPlacement(placements, cmsMembership); } public Set fullCMSMemberIds() @@ -253,52 +255,57 @@ public EndpointsForRange fullCMSMembersAsReplicas() return fullCMSReplicas; } - private DataPlacements maybeAddMetaPlacement(DataPlacements placements, CMSMembership cms) + public DataPlacement getCMSPlacement() + { + return cmsDataPlacement; + } + + private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMembership cms) { if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) - return placements; - - DataPlacement metaPlacement; - Epoch previousLastModified = placements.lastModified(); - Epoch nextLastModified; - if (epoch.is(Epoch.FIRST)) - { - // PRE_INITIALIZE_CMS: placements need to be hardcoded to the local address so that the subsequent - // INITIALIZE_CMS can be committed - Replica localReplica = MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort()); - metaPlacement = DataPlacement.builder() - .withReadReplica(Epoch.FIRST, localReplica) - .withWriteReplica(Epoch.FIRST, localReplica) - .build(); - nextLastModified = Epoch.FIRST; - } - else if (epoch.isAfter(Epoch.FIRST) && directory.isEmpty()) - { - // This cluster did not previously upgrade from a gossip based version (i.e. pre-6.0) but did at some point - // run a version prior to MetadataVersion.V7 where we started to encode CMS membership directly. This - // condition implies that we are reconstructing a serialized cluster metadata during replay or else the - // directory should not be empty after Epoch.FIRST as the base state in INITIALIZE_CMS now includes the - // first CMS node. Similarly, if the cluster had previously been running a gossip-based version, the - // directory would contain entries for each of the live nodes at the time of upgrade. - // Given this state, the very next transformation that is/was applied will be to register the node that - // committed the PRE_INITIALIZE_CMS and INTIALIZE_CMS transformations. So we just leave the placements - // untouched as they will already contain that node as an endpoint. - return placements; + return DataPlacement.empty(); + + if (directory.isEmpty()) + { + if (epoch.is(Epoch.FIRST)) + { + // PRE_INITIALIZE_CMS: placements need to be hardcoded to the local address so that the subsequent + // INITIALIZE_CMS can be committed + Replica localReplica = MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort()); + return DataPlacement.builder() + .withReadReplica(Epoch.FIRST, localReplica) + .withWriteReplica(Epoch.FIRST, localReplica) + .build(); + } + else + { + // This cluster did not previously upgrade from a gossip based version (i.e. pre-6.0) but did at some point + // run a version prior to MetadataVersion.V7 where we started to encode CMS membership directly. This + // condition implies that we are reconstructing a serialized cluster metadata during replay or else the + // directory should not be empty after Epoch.FIRST as the base state in INITIALIZE_CMS now includes the + // first CMS node. Similarly, if the cluster had previously been running a gossip-based version, the + // directory would contain entries for each of the live nodes at the time of upgrade. + // Given this state, the very next transformation that is/was applied will be to register the node that + // committed the PRE_INITIALIZE_CMS and INTIALIZE_CMS transformations. So we just extract the placements + // associated with the MetaStrategy params as they will already contain that node as an endpoint. + for (ReplicationParams params : placements.keys()) + if (params.isMeta()) + return placements.get(params); + + // This point can only be reached in tests. + // TODO enforce that invariant somehow + Replica localReplica = MetaStrategy.replica(FBUtilities.getBroadcastAddressAndPort()); + return DataPlacement.builder() + .withReadReplica(epoch, localReplica) + .withWriteReplica(epoch, localReplica) + .build(); + } } else { // Build a placement based on the CMS membership - metaPlacement = cms.toPlacement(directory); - if (cms.lastModified().isAfter(previousLastModified)) - nextLastModified = cms.lastModified(); - else - nextLastModified = previousLastModified; - + return cms.toPlacement(directory); } - return placements.unbuild() - .with(ReplicationParams.meta(this), metaPlacement) - .build() - .withLastModified(nextLastModified); } public Transformer transformer() @@ -1155,7 +1162,11 @@ public void serialize(ClusterMetadata metadata, DataOutputPlus out, Version vers DistributedSchema.serializer.serialize(metadata.schema, out, version); Directory.serializer.serialize(metadata.directory, out, version); TokenMap.serializer.serialize(metadata.tokenMap, out, version); - DataPlacements.serializer.serialize(metadata.placements, out, version); + // Prior to V9, placements for the MetaStrategy keyspace were included in the main DataPlacements + // so when targetting such a version, emulate that. + DataPlacements placements = version.isBefore(Version.V9) ? preV9Placements(metadata) : metadata.placements; + DataPlacements.serializer.serialize(placements, out, version); + if (version.isAtLeast(MIN_ACCORD_VERSION)) { AccordFastPath.serializer.serialize(metadata.accordFastPath, out, version); @@ -1174,7 +1185,8 @@ public void serialize(ClusterMetadata metadata, DataOutputPlus out, Version vers assert key.valueType.isInstance(value); value.serialize(out, version); } - if (version.isAtLeast(Version.V7)) + // From V9 CMS membership is directly encoded in ClusterMetadata + if (version.isAtLeast(Version.V9)) CMSMembership.serializer.serialize(metadata.cmsMembership, out, version); } @@ -1234,19 +1246,20 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE } CMSMembership cmsMembership = CMSMembership.EMPTY; - if (version.isAtLeast(Version.V8)) + if (version.isAtLeast(Version.V9)) cmsMembership = CMSMembership.serializer.deserialize(in, version); else { KeyspaceMetadata metadataKs = schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); if (!dir.isEmpty()) { - // Pre-V7 the membership of the CMS was always inferred from the placement of the distributed + // Pre-V9 the membership of the CMS was always inferred from the placement of the distributed // metadata keyspace. This is true for the initial cluster metadata created during upgrade from // gossip and for subsequent epochs. The endpoints in the placement must belong to registered nodes, // so we can derive the CMSMembership using the data placement and directory. DataPlacement placement = placements.get(metadataKs.params.replication); cmsMembership = CMSMembership.reconstruct(placement, dir); + placements = placements.unbuild().without(metadataKs.params.replication).build(); } placements = placements.unbuild().without(metadataKs.params.replication).build(); } @@ -1299,8 +1312,7 @@ public long serializedSize(ClusterMetadata metadata, Version version) sizeof(metadata.partitioner.getClass().getCanonicalName()) + DistributedSchema.serializer.serializedSize(metadata.schema, version) + Directory.serializer.serializedSize(metadata.directory, version) + - TokenMap.serializer.serializedSize(metadata.tokenMap, version) + - DataPlacements.serializer.serializedSize(metadata.placements, version); + TokenMap.serializer.serializedSize(metadata.tokenMap, version); if (version.isAtLeast(MIN_ACCORD_VERSION)) { @@ -1312,12 +1324,27 @@ public long serializedSize(ClusterMetadata metadata, Version version) size += LockedRanges.serializer.serializedSize(metadata.lockedRanges, version) + InProgressSequences.serializer.serializedSize(metadata.inProgressSequences, version); - if (version.isAtLeast(Version.V8)) + // Prior to V9, placements for the MetaStrategy keyspace were included in the main DataPlacements + // so when targetting such a version, emulate that. + DataPlacements placements = version.isBefore(Version.V9) ? preV9Placements(metadata) : metadata.placements; + size += DataPlacements.serializer.serializedSize(placements, version); + // From V9 CMS membership is directly encoded in ClusterMetadata + if (version.isAtLeast(Version.V9)) size += CMSMembership.serializer.serializedSize(metadata.cmsMembership, version); return size; } + private DataPlacements preV9Placements(ClusterMetadata metadata) + { + if (metadata.cmsDataPlacement.isEmpty()) + return metadata.placements; + + return metadata.placements.unbuild() + .with(ReplicationParams.meta(metadata), metadata.cmsDataPlacement) + .build(); + } + public static IPartitioner getPartitioner(DataInputPlus in, Version version) throws IOException { if (version.isAtLeast(Version.V1)) diff --git a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java index b45887bbc50b..a7cd5590b06d 100644 --- a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java @@ -70,7 +70,7 @@ protected boolean acceptCommit(ClusterMetadata metadata) if (metadata.epoch.isAfter(Epoch.FIRST) && metadata.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())) return true; return metadata.epoch.isEqualOrBefore(Epoch.FIRST) - && metadata.placements.get(ReplicationParams.meta(metadata)).reads.byEndpoint().keySet().contains(FBUtilities.getBroadcastAddressAndPort()); + && metadata.getCMSPlacement().reads.byEndpoint().keySet().contains(FBUtilities.getBroadcastAddressAndPort()); } @Override diff --git a/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java b/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java index fd1a2618db6b..6c0ff2012c5c 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java +++ b/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java @@ -106,6 +106,11 @@ public static DataPlacement empty() return EMPTY; } + public boolean isEmpty() + { + return reads.isEmpty() && writes.isEmpty(); + } + public static Builder builder() { return new Builder(ReplicaGroups.builder(), diff --git a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java index a21d809298b6..7a9e236b1253 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java +++ b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java @@ -45,6 +45,7 @@ import org.apache.cassandra.utils.FBUtilities; import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; public class DataPlacements extends ReplicationMap implements MetadataValue { @@ -250,14 +251,13 @@ public static class Serializer implements MetadataSerializer public void serialize(DataPlacements t, DataOutputPlus out, Version version) throws IOException { Map map = t.asMap(); + if (version.isBefore(Version.V9)) + out.writeInt(map.size()); + else + out.writeUnsignedVInt32(map.size()); - // From V7, placements for the metadata keyspace are derived from CMSMembership, not serialized - int mapSize = version.isBefore(Version.V7) ? map.size() : Math.max(map.size() - 1, 0); - out.writeInt(mapSize); for (Map.Entry entry : map.entrySet()) { - if (version.isAtLeast(Version.V7) && entry.getKey().isMeta()) - continue; ReplicationParams.serializer.serialize(entry.getKey(), out, version); DataPlacement.serializerFor(entry.getKey()).serialize(entry.getValue(), out, version); } @@ -266,7 +266,7 @@ public void serialize(DataPlacements t, DataOutputPlus out, Version version) thr public DataPlacements deserialize(DataInputPlus in, Version version) throws IOException { - int size = in.readInt(); + int size = version.isBefore(Version.V9) ? in.readInt() : in.readUnsignedVInt32(); Map map = Maps.newHashMapWithExpectedSize(size); for (int i = 0; i < size; i++) { @@ -280,13 +280,9 @@ public DataPlacements deserialize(DataInputPlus in, Version version) throws IOEx public long serializedSize(DataPlacements t, Version version) { Map map = t.asMap(); - // From V7, placements for the metadata keyspace are derived from CMSMembership, not serialized - int mapSize = version.isBefore(Version.V7) ? map.size() : Math.max(map.size() - 1, 0); - long size = sizeof(mapSize); + long size = version.isBefore(Version.V9) ? sizeof(map.size()) : sizeofUnsignedVInt(map.size()); for (Map.Entry entry : map.entrySet()) { - if (version.isAtLeast(Version.V7) && entry.getKey().isMeta()) - continue; size += ReplicationParams.serializer.serializedSize(entry.getKey(), version); size += DataPlacement.serializerFor(entry.getKey()).serializedSize(entry.getValue(), version); } diff --git a/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java b/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java index 3d3b4da374aa..26ecdc33e2be 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java +++ b/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java @@ -329,7 +329,11 @@ public DataPlacements calculatePlacements(Epoch epoch, logger.trace("Calculating data placements for {}", ksMetadata.name); AbstractReplicationStrategy replication = ksMetadata.replicationStrategy; ReplicationParams params = ksMetadata.params.replication; - if (params.isMeta() || params.isLocal()) + if (params.isMeta()) + { + // don't calculate meta strategy placements, these are derived from ClusterMetadata.cmsMembership + } + else if (params.isLocal()) { placements.put(params, metadata.placements.get(params)); } diff --git a/src/java/org/apache/cassandra/tcm/serialization/Version.java b/src/java/org/apache/cassandra/tcm/serialization/Version.java index 1f15a2e31b95..50417d17837b 100644 --- a/src/java/org/apache/cassandra/tcm/serialization/Version.java +++ b/src/java/org/apache/cassandra/tcm/serialization/Version.java @@ -79,6 +79,7 @@ public enum Version V8(8), /** * - DataPlacements don't include MetaStrategy, replaced by ClusterMetadata.CMSMembership + * - Size of DataPlacements is encoded as vint */ V9(9), From cb9ebbbb9442e03be51261724b9cf385ce24e545 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 19 Jun 2025 13:38:36 +0100 Subject: [PATCH 12/23] [CASSANDRA-20736] Make DataPlacements private on ClusterMetadata --- .../schema/CreateKeyspaceStatement.java | 2 +- .../db/AbstractMutationVerbHandler.java | 2 +- .../cassandra/db/DiskBoundaryManager.java | 2 +- .../cassandra/db/ReadCommandVerbHandler.java | 4 +-- .../db/compaction/CompactionManager.java | 2 +- .../apache/cassandra/db/view/ViewUtils.java | 6 ++-- .../cassandra/hints/HintsDispatcher.java | 2 +- .../cassandra/locator/EndpointsForToken.java | 8 +---- .../cassandra/locator/MetaStrategy.java | 4 +-- .../cassandra/locator/ReplicaLayout.java | 14 ++++---- .../cassandra/locator/ReplicaPlans.java | 2 +- .../service/ActiveRepairService.java | 2 +- .../org/apache/cassandra/service/Rebuild.java | 2 +- .../cassandra/service/StorageService.java | 10 +++--- .../service/accord/AccordTopology.java | 2 +- .../apache/cassandra/service/paxos/Paxos.java | 2 +- .../cassandra/service/paxos/PaxosRepair.java | 2 +- .../streaming/DataMovementVerbHandler.java | 2 +- .../apache/cassandra/tcm/ClusterMetadata.java | 32 ++++++++++++------- .../apache/cassandra/tcm/MetadataKeys.java | 2 +- .../cassandra/tcm/PaxosBackedProcessor.java | 1 - .../listeners/PlacementsChangeListener.java | 4 +-- .../tcm/ownership/UniformRangePlacement.java | 2 +- .../tcm/sequences/BootstrapAndJoin.java | 4 +-- .../tcm/sequences/BootstrapAndReplace.java | 4 +-- .../apache/cassandra/tcm/sequences/Move.java | 4 +-- .../tcm/sequences/ProgressBarrier.java | 6 ++-- .../tcm/sequences/RemoveNodeStreams.java | 2 +- .../tcm/sequences/ReplaceSameAddress.java | 2 +- .../tcm/sequences/UnbootstrapAndLeave.java | 2 +- .../tcm/transformations/AccordMarkStale.java | 2 +- .../tcm/transformations/AlterSchema.java | 2 +- .../tcm/transformations/AlterTopology.java | 4 +-- .../transformations/ApplyPlacementDeltas.java | 2 +- .../tcm/transformations/PrepareJoin.java | 6 ++-- .../tcm/transformations/PrepareLeave.java | 2 +- .../tcm/transformations/PrepareMove.java | 4 +-- .../tcm/transformations/PrepareReplace.java | 2 +- .../cms/PrepareCMSReconfiguration.java | 4 +-- .../tools/TransformClusterMetadataHelper.java | 27 ++++------------ .../distributed/shared/ClusterUtils.java | 8 ++--- .../test/log/ClusterMetadataTestHelper.java | 2 +- .../log/MetadataChangeSimulationTest.java | 12 +++---- .../test/log/OperationalEquivalenceTest.java | 3 +- .../test/log/ReconfigureCMSTest.java | 8 ++--- .../test/log/ResumableStartupTest.java | 4 +-- .../test/log/SimulatedOperation.java | 4 +-- .../distributed/test/log/SnapshotTest.java | 2 +- .../test/ring/RangeVersioningTest.java | 2 +- ...ClusterMetadataUpgradeAssassinateTest.java | 2 +- .../fuzz/topology/TopologyMixupTestBase.java | 2 +- .../simulator/cluster/OnClusterReplace.java | 4 +-- .../cassandra/locator/SimpleStrategyTest.java | 2 +- .../service/accord/AccordTopologyUtils.java | 2 +- .../service/accord/EpochSyncTest.java | 6 ++-- .../cassandra/tcm/BootWithMetadataTest.java | 2 +- .../cassandra/tcm/ClusterMetadataTest.java | 4 +-- .../ClusterMetadataTransformationTest.java | 2 +- .../apache/cassandra/tcm/UnregisterTest.java | 2 +- .../tcm/compatibility/GossipHelperTest.java | 6 ++-- .../MetadataSnapshotListenerTest.java | 2 +- .../InProgressSequenceCancellationTest.java | 2 +- .../tcm/sequences/ProgressBarrierTest.java | 10 +++--- .../transformations/EventsMetadataTest.java | 14 ++++---- 64 files changed, 144 insertions(+), 153 deletions(-) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java index a622b861921c..eac58187b157 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java @@ -96,7 +96,7 @@ public Keyspaces apply(ClusterMetadata metadata) // as we have as keys in metadata.placements to have a fast map lookup // ReplicationParams are immutable, so it is a safe optimization KeyspaceParams keyspaceParams = attrs.asNewKeyspaceParams(); - ReplicationParams replicationParams = metadata.placements.deduplicateReplicationParams(keyspaceParams.replication); + ReplicationParams replicationParams = metadata.placements().deduplicateReplicationParams(keyspaceParams.replication); keyspaceParams = keyspaceParams.withSwapped(replicationParams); KeyspaceMetadata keyspaceMetadata = KeyspaceMetadata.create(keyspaceName, keyspaceParams); diff --git a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java index 5d2e4e3cdcfe..18aa78552b73 100644 --- a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java +++ b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java @@ -195,6 +195,6 @@ else if (message.epoch().isBefore(metadata.schema.lastModified())) private static VersionedEndpoints.ForToken writePlacements(ClusterMetadata metadata, String keyspace, DecoratedKey key) { - return metadata.placements.get(metadata.schema.getKeyspace(keyspace).getMetadata().params.replication).writes.forToken(key.getToken()); + return metadata.placement(metadata.schema.getKeyspace(keyspace).getMetadata().params.replication).writes.forToken(key.getToken()); } } diff --git a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java index 1b35423fe93b..be8b30ff0732 100644 --- a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java +++ b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java @@ -148,7 +148,7 @@ private static RangesAtEndpoint getLocalRanges(ColumnFamilyStore cfs, ClusterMet if (StorageService.instance.isBootstrapMode() && !StorageService.isReplacingSameAddress()) // When replacing same address, the node marks itself as UN locally { - placement = metadata.placements.get(cfs.keyspace.getMetadata().params.replication); + placement = metadata.placement(cfs.keyspace.getMetadata().params.replication); } else { diff --git a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java index 8ff89f4e9b5d..5c93b8d6a62e 100644 --- a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java @@ -238,8 +238,8 @@ private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message getViewNaturalEndpoint(ClusterMetadata metadata, Location local = metadata.locator.local(); KeyspaceMetadata keyspaceMetadata = metadata.schema.getKeyspaces().getNullable(keyspace); - EndpointsForToken naturalBaseReplicas = metadata.placements.get(keyspaceMetadata.params.replication).reads.forToken(baseToken).get(); - EndpointsForToken naturalViewReplicas = metadata.placements.get(keyspaceMetadata.params.replication).reads.forToken(viewToken).get(); + DataPlacement placement = metadata.placement(keyspaceMetadata.params.replication); + EndpointsForToken naturalBaseReplicas = placement.reads.forToken(baseToken).get(); + EndpointsForToken naturalViewReplicas = placement.reads.forToken(viewToken).get(); Optional localReplica = Iterables.tryFind(naturalViewReplicas, Replica::isSelf).toJavaUtil(); if (localReplica.isPresent()) diff --git a/src/java/org/apache/cassandra/hints/HintsDispatcher.java b/src/java/org/apache/cassandra/hints/HintsDispatcher.java index 3ba38bdeafee..025d52579be0 100644 --- a/src/java/org/apache/cassandra/hints/HintsDispatcher.java +++ b/src/java/org/apache/cassandra/hints/HintsDispatcher.java @@ -276,7 +276,7 @@ private void rehintHintsNeedingRehinting() // Also may need to apply locally because it's possible this is from the batchlog // and we never applied it locally // TODO (review): Additional error handling necessary? Hints are lossy - DataPlacement dataPlacement = cm.placements.get(cm.schema.getKeyspace(mutation.getKeyspaceName()).getMetadata().params.replication); + DataPlacement dataPlacement = cm.placement(cm.schema.getKeyspace(mutation.getKeyspaceName()).getMetadata().params.replication); VersionedEndpoints.ForToken forToken = dataPlacement.writes.forToken(mutation.key().getToken()); Replica self = forToken.get().selfIfPresent(); if (self != null) diff --git a/src/java/org/apache/cassandra/locator/EndpointsForToken.java b/src/java/org/apache/cassandra/locator/EndpointsForToken.java index f0a3f732bafa..7342816015b2 100644 --- a/src/java/org/apache/cassandra/locator/EndpointsForToken.java +++ b/src/java/org/apache/cassandra/locator/EndpointsForToken.java @@ -25,9 +25,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; /** @@ -166,11 +164,7 @@ public static EndpointsForToken copyOf(Token token, Iterable replicas) public static VersionedEndpoints.ForToken natural(Keyspace keyspace, Token token) { - ReplicationParams replication = keyspace.getMetadata().params.replication; - DataPlacement placement = replication.isMeta() - ? ClusterMetadata.current().getCMSPlacement() - : ClusterMetadata.current().placements.get(replication); - return placement.reads.forToken(token); + return ClusterMetadata.current().placement(keyspace.getMetadata().params.replication).reads.forToken(token); } } diff --git a/src/java/org/apache/cassandra/locator/MetaStrategy.java b/src/java/org/apache/cassandra/locator/MetaStrategy.java index a7afa4898c40..fdebff72b166 100644 --- a/src/java/org/apache/cassandra/locator/MetaStrategy.java +++ b/src/java/org/apache/cassandra/locator/MetaStrategy.java @@ -82,13 +82,13 @@ public MetaStrategy(String keyspaceName, Map configOptions) @Override public EndpointsForRange calculateNaturalReplicas(Token token, ClusterMetadata metadata) { - return metadata.placements.get(ReplicationParams.meta(metadata)).reads.forRange(entireRange).get(); + return metadata.placement(ReplicationParams.meta(metadata)).reads.forRange(entireRange).get(); } @Override public DataPlacement calculateDataPlacement(Epoch epoch, List> ranges, ClusterMetadata metadata) { - return metadata.placements.get(ReplicationParams.meta(metadata)); + return metadata.placement(ReplicationParams.meta(metadata)); } @Override diff --git a/src/java/org/apache/cassandra/locator/ReplicaLayout.java b/src/java/org/apache/cassandra/locator/ReplicaLayout.java index 71a4ff4beb4e..40533e96195c 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaLayout.java +++ b/src/java/org/apache/cassandra/locator/ReplicaLayout.java @@ -239,9 +239,7 @@ public static ReplicaLayout.ForTokenWrite forTokenWriteLiveAndDown(ClusterMetada { // todo deduplicate so that "pending" contains "read - write", // which is a hack until we revisit how consistency level handles pending - DataPlacement dataPlacement = ks.params.replication.isMeta() - ? metadata.getCMSPlacement() - : metadata.placements.get(ks.params.replication); + DataPlacement dataPlacement = metadata.placement(ks.params.replication); natural = forNonLocalStrategyTokenRead(dataPlacement, token); // perf optimization to avoid double endpoints search and filtering for a typical case // DataPlacement constructor does a deduplication of reads/writes, so we can use cheap == comparision here @@ -396,10 +394,12 @@ static ReplicaLayout.ForRangeRead forRangeReadSorted(ClusterMetadata metadata, K static EndpointsForRange forNonLocalStategyRangeRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, AbstractBounds range) { - DataPlacement placement = keyspace.params.replication.isMeta() - ? metadata.getCMSPlacement() - : metadata.placements.get(keyspace.params.replication); - return placement.reads.forRange(range.right.getToken()).get(); + return metadata.placement(keyspace.params.replication).reads.forRange(range.right.getToken()).get(); + } + + public static EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, Token token) + { + return forNonLocalStrategyTokenRead(metadata.placement(keyspace.params.replication), token); } private static EndpointsForToken forNonLocalStrategyTokenRead(DataPlacement dataPlacement, Token token) diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlans.java b/src/java/org/apache/cassandra/locator/ReplicaPlans.java index da52510bf343..b6e71bb3eebe 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaPlans.java +++ b/src/java/org/apache/cassandra/locator/ReplicaPlans.java @@ -232,7 +232,7 @@ public static Replica findCounterLeaderReplica(ClusterMetadata metadata, String NodeProximity proximity = DatabaseDescriptor.getNodeProximity(); AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy(); - EndpointsForToken replicas = metadata.placements.get(keyspace.getMetadata().params.replication).reads.forToken(key.getToken()).get(); + EndpointsForToken replicas = metadata.placement(keyspace.getMetadata().params.replication).reads.forToken(key.getToken()).get(); // CASSANDRA-13043: filter out those endpoints not accepting clients yet, maybe because still bootstrapping replicas = replicas.filter(replica -> StorageService.instance.isRpcReady(replica.endpoint())); diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 286e8e86fcb5..bf187247e74f 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -1213,7 +1213,7 @@ public List>> repairPaxosForTopologyChangeAsync(String ksName // are based on the system partitioner EndpointsForRange endpoints = replication.isMeta() ? ClusterMetadata.current().fullCMSMembersAsReplicas() - : ClusterMetadata.current().placements.get(replication).reads.forRange(range).get(); + : ClusterMetadata.current().placement(replication).reads.forRange(range).get(); Set liveEndpoints = endpoints.filter(FailureDetector.isReplicaAlive).endpoints(); if (!PaxosRepair.hasSufficientLiveNodesForTopologyChange(keyspace, range, liveEndpoints)) diff --git a/src/java/org/apache/cassandra/service/Rebuild.java b/src/java/org/apache/cassandra/service/Rebuild.java index 7c28960b96bf..c766dddacb75 100644 --- a/src/java/org/apache/cassandra/service/Rebuild.java +++ b/src/java/org/apache/cassandra/service/Rebuild.java @@ -232,7 +232,7 @@ private static RangesAtEndpoint rangesForRebuildWithTokens(String tokens, String private static MovementMap movementMap(ClusterMetadata metadata, String keyspace, String tokens) { MovementMap.Builder movementMapBuilder = MovementMap.builder(); - DataPlacements placements = metadata.placements; + DataPlacements placements = metadata.placements(); if (keyspace == null) { placements.forEach((params, placement) -> movementMapBuilder.put(params, addMovementsForParams(placement, null))); diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index dc9d9e3f680b..08d071b828e9 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -2076,7 +2076,7 @@ private EndpointsByRange constructRangeToEndpointMap(String keyspace, List range : ranges) { Token token = tokenMap.nextToken(tokenMap.tokens(), range.right.getToken()); - rangeToEndpointMap.put(range, metadata.placements.get(keyspaceMetadata.params.replication) + rangeToEndpointMap.put(range, metadata.placement(keyspaceMetadata.params.replication) .reads.forRange(token).get()); } } @@ -3414,14 +3414,14 @@ public EndpointsForToken getNaturalReplicasForToken(String keyspaceName, ByteBuf token = MetaStrategy.partitioner.getToken(key); else token = metadata.partitioner.getToken(key); - return metadata.placements.get(keyspaceMetadata.params.replication).reads.forToken(token).get(); + return metadata.placement(keyspaceMetadata.params.replication).reads.forToken(token).get(); } public boolean isEndpointValidForWrite(String keyspace, Token token) { ClusterMetadata metadata = ClusterMetadata.current(); KeyspaceMetadata keyspaceMetadata = metadata.schema.getKeyspaces().getNullable(keyspace); - return keyspaceMetadata != null && metadata.placements.get(keyspaceMetadata.params.replication).writes.forToken(token).get().containsSelf(); + return keyspaceMetadata != null && metadata.placement(keyspaceMetadata.params.replication).writes.forToken(token).get().containsSelf(); } public void setLoggingLevel(String classQualifier, String rawLevel) throws Exception @@ -4227,7 +4227,7 @@ private LinkedHashMap getEffectiveOwnership(String ke if (replicationParams.isMeta()) { LinkedHashMap ownership = Maps.newLinkedHashMap(); - metadata.placements.get(replicationParams).writes.byEndpoint().flattenValues().forEach((r) -> { + metadata.placement(replicationParams).writes.byEndpoint().flattenValues().forEach((r) -> { ownership.put(r.endpoint(), 1.0f); }); return ownership; diff --git a/src/java/org/apache/cassandra/service/accord/AccordTopology.java b/src/java/org/apache/cassandra/service/accord/AccordTopology.java index f37aef6811d8..2f26338bf845 100644 --- a/src/java/org/apache/cassandra/service/accord/AccordTopology.java +++ b/src/java/org/apache/cassandra/service/accord/AccordTopology.java @@ -310,7 +310,7 @@ public static Topology createAccordTopology(Epoch epoch, DistributedSchema schem public static Topology createAccordTopology(ClusterMetadata metadata, ShardLookup lookup) { - return createAccordTopology(metadata.epoch, metadata.schema, metadata.placements, metadata.directory, metadata.accordFastPath, lookup, metadata.accordStaleReplicas); + return createAccordTopology(metadata.epoch, metadata.schema, metadata.placements(), metadata.directory, metadata.accordFastPath, lookup, metadata.accordStaleReplicas); } public static Topology createAccordTopology(ClusterMetadata metadata, Topology current) diff --git a/src/java/org/apache/cassandra/service/paxos/Paxos.java b/src/java/org/apache/cassandra/service/paxos/Paxos.java index a40900f73cf0..da0d2b4e0dd5 100644 --- a/src/java/org/apache/cassandra/service/paxos/Paxos.java +++ b/src/java/org/apache/cassandra/service/paxos/Paxos.java @@ -277,7 +277,7 @@ static Electorate.Local get(TableMetadata table, DecoratedKey key, ConsistencyLe final Token token = table.partitioner == MetaStrategy.partitioner ? MetaStrategy.entireRange.right : key.getToken(); ClusterMetadata metadata = ClusterMetadata.current(); Keyspace keyspace = Keyspace.open(table.keyspace); - DataPlacement placement = metadata.placements.get(keyspace.getMetadata().params.replication); + DataPlacement placement = metadata.placement(keyspace.getMetadata().params.replication); Epoch epoch = placement.writes.forToken(token).lastModified(); ForTokenWrite electorate = forTokenWriteLiveAndDown(metadata, keyspace, token); if (consistency == LOCAL_SERIAL) diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java b/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java index 5948086394de..33189404e694 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java @@ -586,7 +586,7 @@ public static boolean hasSufficientLiveNodesForTopologyChange(Keyspace keyspace, ClusterMetadata metadata = ClusterMetadata.current(); Collection allEndpoints = replication.isMeta() ? metadata.fullCMSMembers() - : metadata.placements.get(replication).reads.forRange(range).endpoints(); + : metadata.placement(replication).reads.forRange(range).endpoints(); return hasSufficientLiveNodesForTopologyChange(allEndpoints, liveEndpoints, ep -> metadata.locator.location(ep).datacenter, diff --git a/src/java/org/apache/cassandra/streaming/DataMovementVerbHandler.java b/src/java/org/apache/cassandra/streaming/DataMovementVerbHandler.java index 1bccf29044f2..0bf743fef573 100644 --- a/src/java/org/apache/cassandra/streaming/DataMovementVerbHandler.java +++ b/src/java/org/apache/cassandra/streaming/DataMovementVerbHandler.java @@ -50,7 +50,7 @@ public void doVerb(Message message) throws IOException StreamPlan streamPlan = new StreamPlan(StreamOperation.fromString(message.payload.streamOperation)); ClusterMetadata metadata = ClusterMetadata.current(); Schema.instance.getNonLocalStrategyKeyspaces().stream().forEach((ksm) -> { - if (metadata.placements.get(ksm.params.replication).writes.byEndpoint().keySet().size() <= 1) + if (metadata.placement(ksm.params.replication).writes.byEndpoint().keySet().size() <= 1) return; message.payload.movements.get(ksm.params.replication).asMap().forEach((local, endpoints) -> { diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 56b71360a6bd..c70001c8e272 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -105,7 +105,7 @@ public class ClusterMetadata public final DistributedSchema schema; public final Directory directory; public final TokenMap tokenMap; - public final DataPlacements placements; + private final DataPlacements placements; public final AccordFastPath accordFastPath; public final LockedRanges lockedRanges; public final InProgressSequences inProgressSequences; @@ -308,6 +308,16 @@ private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMember } } + public DataPlacement placement(ReplicationParams params) + { + return params.isMeta() ? cmsDataPlacement : placements.get(params); + } + + public DataPlacements placements() + { + return placements; + } + public Transformer transformer() { return new Transformer(this, this.nextEpoch()); @@ -422,8 +432,8 @@ public DataPlacement writePlacementAllSettled(KeyspaceMetadata ksm) // TODO Remove this as it isn't really an equivalent to the previous concept of pending ranges public boolean hasPendingRangesFor(KeyspaceMetadata ksm, Token token) { - ReplicaGroups writes = placements.get(ksm.params.replication).writes; - ReplicaGroups reads = placements.get(ksm.params.replication).reads; + ReplicaGroups writes = placement(ksm.params.replication).writes; + ReplicaGroups reads = placement(ksm.params.replication).reads; if (ksm.params.replication.isMeta()) return !reads.equals(writes); return !reads.forToken(token).equals(writes.forToken(token)); @@ -432,8 +442,8 @@ public boolean hasPendingRangesFor(KeyspaceMetadata ksm, Token token) // TODO Remove this as it isn't really an equivalent to the previous concept of pending ranges public boolean hasPendingRangesFor(KeyspaceMetadata ksm, InetAddressAndPort endpoint) { - ReplicaGroups writes = placements.get(ksm.params.replication).writes; - ReplicaGroups reads = placements.get(ksm.params.replication).reads; + ReplicaGroups writes = placement(ksm.params.replication).writes; + ReplicaGroups reads = placement(ksm.params.replication).reads; return !writes.byEndpoint().get(endpoint).equals(reads.byEndpoint().get(endpoint)); } @@ -444,22 +454,22 @@ public Collection> localWriteRanges(KeyspaceMetadata metadata) public Collection> writeRanges(KeyspaceMetadata metadata, InetAddressAndPort peer) { - return placements.get(metadata.params.replication).writes.byEndpoint().get(peer).ranges(); + return placement(metadata.params.replication).writes.byEndpoint().get(peer).ranges(); } // TODO Remove this as it isn't really an equivalent to the previous concept of pending ranges public Map, VersionedEndpoints.ForRange> pendingRanges(KeyspaceMetadata metadata) { Map, VersionedEndpoints.ForRange> map = new HashMap<>(); - ReplicaGroups writes = placements.get(metadata.params.replication).writes; - ReplicaGroups reads = placements.get(metadata.params.replication).reads; + ReplicaGroups writes = placement(metadata.params.replication).writes; + ReplicaGroups reads = placement(metadata.params.replication).reads; // first, pending ranges as the result of range splitting or merging // i.e. new ranges being created through join/leave List> pending = new ArrayList<>(writes.ranges()); pending.removeAll(reads.ranges()); for (Range p : pending) - map.put(p, placements.get(metadata.params.replication).writes.forRange(p)); + map.put(p, placement(metadata.params.replication).writes.forRange(p)); // next, ranges where the ranges themselves are not changing, but the replicas are // i.e. replacement or RF increase @@ -476,8 +486,8 @@ public Map, VersionedEndpoints.ForRange> pendingRanges(KeyspaceMeta // TODO Remove this as it isn't really an equivalent to the previous concept of pending endpoints public VersionedEndpoints.ForToken pendingEndpointsFor(KeyspaceMetadata metadata, Token t) { - VersionedEndpoints.ForToken writeEndpoints = placements.get(metadata.params.replication).writes.forToken(t); - VersionedEndpoints.ForToken readEndpoints = placements.get(metadata.params.replication).reads.forToken(t); + VersionedEndpoints.ForToken writeEndpoints = placement(metadata.params.replication).writes.forToken(t); + VersionedEndpoints.ForToken readEndpoints = placement(metadata.params.replication).reads.forToken(t); EndpointsForToken.Builder endpointsForToken = writeEndpoints.get().newBuilder(writeEndpoints.size() - readEndpoints.size()); for (Replica writeReplica : writeEndpoints.get()) diff --git a/src/java/org/apache/cassandra/tcm/MetadataKeys.java b/src/java/org/apache/cassandra/tcm/MetadataKeys.java index 8188d03e14cd..ea73c63afafc 100644 --- a/src/java/org/apache/cassandra/tcm/MetadataKeys.java +++ b/src/java/org/apache/cassandra/tcm/MetadataKeys.java @@ -52,7 +52,7 @@ public class MetadataKeys .put(SCHEMA, cm -> cm.schema) .put(NODE_DIRECTORY, cm -> cm.directory) .put(TOKEN_MAP, cm -> cm.tokenMap) - .put(DATA_PLACEMENTS, cm -> cm.placements) + .put(DATA_PLACEMENTS, cm -> cm.placements()) .put(LOCKED_RANGES, cm -> cm.lockedRanges) .put(IN_PROGRESS_SEQUENCES, cm -> cm.inProgressSequences) .put(ACCORD_FAST_PATH, cm -> cm.accordFastPath) diff --git a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java index a7cd5590b06d..6aeb12aa8817 100644 --- a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java @@ -43,7 +43,6 @@ import org.apache.cassandra.net.RequestCallbackWithFailure; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.log.LogState; diff --git a/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java b/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java index 605b52637813..3878fadfaefb 100644 --- a/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java +++ b/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java @@ -33,8 +33,8 @@ public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean private boolean shouldInvalidate(ClusterMetadata prev, ClusterMetadata next) { - if (!prev.placements.lastModified().equals(next.placements.lastModified()) && - !prev.placements.equivalentTo(next.placements)) // <- todo should we update lastModified if the result is the same? + if (!prev.placements().lastModified().equals(next.placements().lastModified()) && + !prev.placements().equivalentTo(next.placements())) // <- todo should we update lastModified if the result is the same? return true; if (prev.schema.getKeyspaces().size() != next.schema.getKeyspaces().size()) diff --git a/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java b/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java index 26ecdc33e2be..6fa832f8d402 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java +++ b/src/java/org/apache/cassandra/tcm/ownership/UniformRangePlacement.java @@ -335,7 +335,7 @@ public DataPlacements calculatePlacements(Epoch epoch, } else if (params.isLocal()) { - placements.put(params, metadata.placements.get(params)); + placements.put(params, metadata.placement(params)); } else { diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java index b410097f84a8..2148a5d567d4 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java @@ -314,7 +314,7 @@ public ProgressBarrier barrier() @Override public ClusterMetadata.Transformer cancel(ClusterMetadata metadata) { - DataPlacements placements = metadata.placements; + DataPlacements placements = metadata.placements(); switch (next) { // need to undo MID_JOIN and START_JOIN, then merge the ranges split by PrepareJoin @@ -345,7 +345,7 @@ public BootstrapAndJoin finishJoiningRing() @VisibleForTesting public Pair getMovementMaps(ClusterMetadata metadata) { - MovementMap movementMap = movementMap(metadata.directory.endpoint(startJoin.nodeId()), metadata.placements, startJoin.delta()); + MovementMap movementMap = movementMap(metadata.directory.endpoint(startJoin.nodeId()), metadata.placements(), startJoin.delta()); MovementMap strictMovementMap = toStrict(movementMap, finishJoin.delta()); return Pair.create(movementMap, strictMovementMap); } diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java index cb34179d4456..5ae2e3c5ddd7 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java @@ -307,7 +307,7 @@ public ProgressBarrier barrier() @Override public ClusterMetadata.Transformer cancel(ClusterMetadata metadata) { - DataPlacements placements = metadata.placements; + DataPlacements placements = metadata.placements(); switch (next) { // need to undo MID_REPLACE and START_REPLACE, but PREPARE_REPLACE doesn't affect placements @@ -344,7 +344,7 @@ public BootstrapAndReplace finishJoiningRing() private static MovementMap movementMap(InetAddressAndPort beingReplaced, PlacementDeltas startDelta) { MovementMap.Builder movementMapBuilder = MovementMap.builder(); - DataPlacements placements = ClusterMetadata.current().placements; + DataPlacements placements = ClusterMetadata.current().placements(); startDelta.forEach((params, delta) -> { EndpointsByReplica.Builder movements = new EndpointsByReplica.Builder(); DataPlacement originalPlacements = placements.get(params); diff --git a/src/java/org/apache/cassandra/tcm/sequences/Move.java b/src/java/org/apache/cassandra/tcm/sequences/Move.java index a26d6ded4500..c680b892b5b9 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/Move.java +++ b/src/java/org/apache/cassandra/tcm/sequences/Move.java @@ -223,7 +223,7 @@ public SequenceState executeNext() StreamPlan streamPlan = new StreamPlan(StreamOperation.RELOCATION); Keyspaces keyspaces = Schema.instance.getNonLocalStrategyKeyspaces(); Map movementMap = movementMap(FailureDetector.instance, - metadata.placements, + metadata.placements(), toSplitRanges, startMove.delta(), midMove.delta(), @@ -330,7 +330,7 @@ public ProgressBarrier barrier() @Override public ClusterMetadata.Transformer cancel(ClusterMetadata metadata) { - DataPlacements placements = metadata.placements; + DataPlacements placements = metadata.placements(); switch (next) { diff --git a/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java b/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java index 7b89acceb6e2..739acaa33bf2 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java +++ b/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java @@ -58,6 +58,7 @@ import org.apache.cassandra.tcm.Retry; import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.concurrent.AsyncPromise; @@ -170,8 +171,9 @@ public boolean await(ConsistencyLevel cl, ClusterMetadata metadata) Set> ranges = e.getValue(); for (Range range : ranges) { - EndpointsForRange writes = metadata.placements.get(params).writes.matchRange(range).get().filter(r -> filter.test(r.endpoint())); - EndpointsForRange reads = metadata.placements.get(params).reads.matchRange(range).get().filter(r -> filter.test(r.endpoint())); + DataPlacement placement = metadata.placement(params); + EndpointsForRange writes = placement.writes.matchRange(range).get().filter(r -> filter.test(r.endpoint())); + EndpointsForRange reads = placement.reads.matchRange(range).get().filter(r -> filter.test(r.endpoint())); reads.stream().map(Replica::endpoint).forEach(superset::add); writes.stream().map(Replica::endpoint).forEach(superset::add); diff --git a/src/java/org/apache/cassandra/tcm/sequences/RemoveNodeStreams.java b/src/java/org/apache/cassandra/tcm/sequences/RemoveNodeStreams.java index d6e8ca312754..bd09f8aaca7d 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/RemoveNodeStreams.java +++ b/src/java/org/apache/cassandra/tcm/sequences/RemoveNodeStreams.java @@ -121,7 +121,7 @@ private static MovementMap movementMap(InetAddressAndPort leaving, ClusterMetada RangesByEndpoint startWriteAdditions = startDelta.get(params).writes.additions; RangesByEndpoint startWriteRemovals = startDelta.get(params).writes.removals; // find current placements from the metadata, we need to stream from replicas that are not changed and are therefore not in the deltas - ReplicaGroups currentPlacements = metadata.placements.get(params).reads; + ReplicaGroups currentPlacements = metadata.placement(params).reads; startWriteAdditions.flattenValues() .forEach(newReplica -> { EndpointsForRange.Builder candidateBuilder = new EndpointsForRange.Builder(newReplica.range()); diff --git a/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java b/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java index 4580d716a1ba..ab819c5f9686 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java +++ b/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java @@ -44,7 +44,7 @@ public static MovementMap movementMap(NodeId nodeId, ClusterMetadata metadata) { MovementMap.Builder builder = MovementMap.builder(); InetAddressAndPort addr = metadata.directory.endpoint(nodeId); - metadata.placements.forEach((params, placement) -> { + metadata.placements().forEach((params, placement) -> { EndpointsByReplica.Builder sources = new EndpointsByReplica.Builder(); placement.reads.byEndpoint().get(addr).forEach(destination -> { placement.reads.forRange(destination.range()).forEach(potentialSource -> { diff --git a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java index 83d543a8dda1..593b18dc41c0 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java +++ b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java @@ -236,7 +236,7 @@ public ProgressBarrier barrier() @Override public ClusterMetadata.Transformer cancel(ClusterMetadata metadata) { - DataPlacements placements = metadata.placements; + DataPlacements placements = metadata.placements(); switch (next) { // need to undo MID_LEAVE and START_LEAVE, but PrepareLeave doesn't affect placement diff --git a/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java index be4385f4262d..77c88e471456 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java +++ b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java @@ -78,7 +78,7 @@ public Result execute(ClusterMetadata prev) for (KeyspaceMetadata keyspace : prev.schema.getKeyspaces().without(SchemaConstants.REPLICATED_SYSTEM_KEYSPACE_NAMES)) { - List shards = AccordTopology.KeyspaceShard.forKeyspace(keyspace, prev.placements, prev.directory); + List shards = AccordTopology.KeyspaceShard.forKeyspace(keyspace, prev.placements(), prev.directory); for (AccordTopology.KeyspaceShard shard : shards) { diff --git a/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java b/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java index a0031c621a93..cc1594caf92b 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java +++ b/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java @@ -244,7 +244,7 @@ public final Result execute(ClusterMetadata prev) DataPlacements.Builder newPlacementsBuilder = DataPlacements.builder(calculatedPlacements.size()); calculatedPlacements.forEach((params, newPlacement) -> { - DataPlacement previousPlacement = prev.placements.get(params); + DataPlacement previousPlacement = prev.placement(params); // Preserve placement versioning that has resulted from natural application where possible if (previousPlacement.equivalentTo(newPlacement)) newPlacementsBuilder.with(params, previousPlacement); diff --git a/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java b/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java index e247f66a8805..4eb3d7b36489 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java +++ b/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java @@ -130,11 +130,11 @@ public Result execute(ClusterMetadata prev) for (Map.Entry update : updates.entrySet()) updated = updated.withUpdatedRackAndDc(update.getKey(), update.getValue()); ClusterMetadata proposed = prev.transformer().with(updated).build().metadata; - DataPlacements proposedPlacements = placementProvider.calculatePlacements(prev.placements.lastModified(), + DataPlacements proposedPlacements = placementProvider.calculatePlacements(prev.placements().lastModified(), proposed.tokenMap.toRanges(), proposed, proposed.schema.getKeyspaces()); - if (!proposedPlacements.equivalentTo(prev.placements)) + if (!proposedPlacements.equivalentTo(prev.placements())) { logger.info("Rejecting topology modifications which would materially change data placements: {}", updates); return new Rejected(INVALID, "Proposed updates modify data placements, violating consistency guarantees"); diff --git a/src/java/org/apache/cassandra/tcm/transformations/ApplyPlacementDeltas.java b/src/java/org/apache/cassandra/tcm/transformations/ApplyPlacementDeltas.java index 2a1eaba7e0c4..ba4a2ca01c41 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/ApplyPlacementDeltas.java +++ b/src/java/org/apache/cassandra/tcm/transformations/ApplyPlacementDeltas.java @@ -72,7 +72,7 @@ public final Result execute(ClusterMetadata prev) ClusterMetadata.Transformer next = prev.transformer(); if (!delta.isEmpty()) - next = next.with(delta.apply(prev.nextEpoch(), prev.placements)); + next = next.with(delta.apply(prev.nextEpoch(), prev.placements())); next = transform(prev, next); diff --git a/src/java/org/apache/cassandra/tcm/transformations/PrepareJoin.java b/src/java/org/apache/cassandra/tcm/transformations/PrepareJoin.java index d8986e6829b5..08f9f102336c 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/PrepareJoin.java +++ b/src/java/org/apache/cassandra/tcm/transformations/PrepareJoin.java @@ -89,7 +89,7 @@ */ public class PrepareJoin implements Transformation { - public static final Serializer serializer = new Serializer() + public static final Serializer serializer = new Serializer<>() { public PrepareJoin construct(NodeId nodeId, Set tokens, PlacementProvider placementProvider, boolean joinTokenRing, boolean streamData) { @@ -168,10 +168,10 @@ public Result execute(ClusterMetadata prev) startJoin, midJoin, finishJoin, joinTokenRing, streamData); if (!prev.tokenMap.isEmpty()) - assertPreExistingWriteReplica(prev.placements, transitionPlan); + assertPreExistingWriteReplica(prev.placements(), transitionPlan); LockedRanges newLockedRanges = prev.lockedRanges.lock(lockKey, rangesToLock); - DataPlacements startingPlacements = transitionPlan.toSplit.apply(prev.nextEpoch(), prev.placements); + DataPlacements startingPlacements = transitionPlan.toSplit.apply(prev.nextEpoch(), prev.placements()); ClusterMetadata.Transformer proposed = prev.transformer() .with(newLockedRanges) .with(startingPlacements) diff --git a/src/java/org/apache/cassandra/tcm/transformations/PrepareLeave.java b/src/java/org/apache/cassandra/tcm/transformations/PrepareLeave.java index 0ad37f8bcd0f..74d2c2c8109a 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/PrepareLeave.java +++ b/src/java/org/apache/cassandra/tcm/transformations/PrepareLeave.java @@ -115,7 +115,7 @@ public Result execute(ClusterMetadata prev) PlacementDeltas startDelta = transitionPlan.addToWrites(); PlacementDeltas midDelta = transitionPlan.moveReads(); PlacementDeltas finishDelta = transitionPlan.removeFromWrites(); - transitionPlan.assertPreExistingWriteReplica(prev.placements); + transitionPlan.assertPreExistingWriteReplica(prev.placements()); LockedRanges.Key unlockKey = LockedRanges.keyFor(proposed.epoch); diff --git a/src/java/org/apache/cassandra/tcm/transformations/PrepareMove.java b/src/java/org/apache/cassandra/tcm/transformations/PrepareMove.java index e7e278d0d266..16dcdfca0851 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/PrepareMove.java +++ b/src/java/org/apache/cassandra/tcm/transformations/PrepareMove.java @@ -109,7 +109,7 @@ public Result execute(ClusterMetadata prev) StartMove startMove = new StartMove(nodeId, transitionPlan.addToWrites(), lockKey); MidMove midMove = new MidMove(nodeId, transitionPlan.moveReads(), lockKey); FinishMove finishMove = new FinishMove(nodeId, tokens, transitionPlan.removeFromWrites(), lockKey); - transitionPlan.assertPreExistingWriteReplica(prev.placements); + transitionPlan.assertPreExistingWriteReplica(prev.placements()); Move sequence = Move.newSequence(prev.nextEpoch(), lockKey, @@ -123,7 +123,7 @@ public Result execute(ClusterMetadata prev) return Transformation.success(prev.transformer() .withNodeState(nodeId, NodeState.MOVING) .with(prev.lockedRanges.lock(lockKey, rangesToLock)) - .with(transitionPlan.toSplit.apply(prev.nextEpoch(), prev.placements)) + .with(transitionPlan.toSplit.apply(prev.nextEpoch(), prev.placements())) .with(prev.inProgressSequences.with(nodeId, sequence)), rangesToLock); } diff --git a/src/java/org/apache/cassandra/tcm/transformations/PrepareReplace.java b/src/java/org/apache/cassandra/tcm/transformations/PrepareReplace.java index 8b1577822b83..94bf3508c04e 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/PrepareReplace.java +++ b/src/java/org/apache/cassandra/tcm/transformations/PrepareReplace.java @@ -109,7 +109,7 @@ public Result execute(ClusterMetadata prev) StartReplace start = new StartReplace(replaced, replacement, transitionPlan.addToWrites(), unlockKey); MidReplace mid = new MidReplace(replaced, replacement, transitionPlan.moveReads(), unlockKey); FinishReplace finish = new FinishReplace(replaced, replacement, transitionPlan.removeFromWrites(), unlockKey); - transitionPlan.assertPreExistingWriteReplica(prev.placements); + transitionPlan.assertPreExistingWriteReplica(prev.placements()); Set tokens = new HashSet<>(prev.tokenMap.tokens(replaced)); BootstrapAndReplace plan = BootstrapAndReplace.newSequence(prev.nextEpoch(), diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java index 0e8c60449137..ad9b51704e67 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java @@ -265,9 +265,7 @@ public Result execute(ClusterMetadata prev) KeyspaceMetadata keyspace = prev.schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, replicationParams, FastPathStrategy.simple())); - return executeInternal(prev, - transformer -> transformer.with(prev.placements.replaceParams(prev.nextEpoch(), ReplicationParams.meta(prev), replicationParams)) - .with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(newKeyspace)))); + return executeInternal(prev, transformer -> transformer.with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(newKeyspace)))); } public String toString() diff --git a/src/java/org/apache/cassandra/tools/TransformClusterMetadataHelper.java b/src/java/org/apache/cassandra/tools/TransformClusterMetadataHelper.java index f5cf15d5b26d..6bf27bcbea2d 100644 --- a/src/java/org/apache/cassandra/tools/TransformClusterMetadataHelper.java +++ b/src/java/org/apache/cassandra/tools/TransformClusterMetadataHelper.java @@ -27,13 +27,11 @@ import org.apache.cassandra.io.util.FileInputStreamPlus; import org.apache.cassandra.io.util.FileOutputStreamPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeVersion; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.serialization.VerboseMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; @@ -63,9 +61,9 @@ public static void main(String ... args) throws IOException DatabaseDescriptor.setPartitionerUnsafe(partitioner); ClusterMetadataService.initializeForTools(false); ClusterMetadata metadata = ClusterMetadataService.deserializeClusterMetadata(sourceFile); - System.out.println("Old CMS: " + metadata.placements.get(ReplicationParams.meta(metadata))); + System.out.println("Old CMS: " + metadata.placement(ReplicationParams.meta(metadata))); metadata = makeCMS(metadata, InetAddressAndPort.getByNameUnchecked(args[1])); - System.out.println("New CMS: " + metadata.placements.get(ReplicationParams.meta(metadata))); + System.out.println("New CMS: " + metadata.placement(ReplicationParams.meta(metadata))); Path p = Files.createTempFile("clustermetadata", "dump"); try (FileOutputStreamPlus out = new FileOutputStreamPlus(p)) { @@ -76,20 +74,9 @@ public static void main(String ... args) throws IOException public static ClusterMetadata makeCMS(ClusterMetadata metadata, InetAddressAndPort endpoint) { - ReplicationParams metaParams = ReplicationParams.meta(metadata); - Iterable currentReplicas = metadata.placements.get(metaParams).writes.byEndpoint().flattenValues(); - DataPlacement.Builder builder = metadata.placements.get(metaParams).unbuild(); - for (Replica replica : currentReplicas) - { - builder.withoutReadReplica(metadata.epoch, replica) - .withoutWriteReplica(metadata.epoch, replica); - } - Replica newCMS = MetaStrategy.replica(endpoint); - builder.withReadReplica(metadata.epoch, newCMS) - .withWriteReplica(metadata.epoch, newCMS); - return metadata.transformer().with(metadata.placements.unbuild().with(metaParams, - builder.build()) - .build()) - .build().metadata; + NodeId id = metadata.directory.peerId(endpoint); + if (id == null) + throw new IllegalStateException("No node id found for endpoint: " + endpoint); + return metadata.transformer().startJoiningCMS(id).finishJoiningCMS(id).build().metadata; } } diff --git a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java index 19986671ecf2..006379ae3bfc 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java @@ -444,8 +444,8 @@ public static Map getPlacementDebugInfo(ClusterMetadataService m for (KeyspaceMetadata keyspace : metadata.schema.getKeyspaces()) { List[] placements = new List[2]; - placements[0] = metadata.placements.get(keyspace.params.replication).reads.toReplicaStringList(); - placements[1] = metadata.placements.get(keyspace.params.replication).writes.toReplicaStringList(); + placements[0] = metadata.placement(keyspace.params.replication).reads.toReplicaStringList(); + placements[1] = metadata.placement(keyspace.params.replication).writes.toReplicaStringList(); byKeyspace.put(keyspace.name, placements); } return byKeyspace; @@ -465,10 +465,10 @@ public static void logPlacementDebugString(ClusterMetadataService metadataServic StringBuilder builder = new StringBuilder(); builder.append("'keyspace' { 'name':").append(keyspace.name).append("', "); builder.append("'reads':['"); - ReplicaGroups placement = metadata.placements.get(keyspace.params.replication).reads; + ReplicaGroups placement = metadata.placement(keyspace.params.replication).reads; builder.append(byEndpoint ? placement.toStringByEndpoint() : placement.toString()); builder.append("'], 'writes':['"); - placement = metadata.placements.get(keyspace.params.replication).writes; + placement = metadata.placement(keyspace.params.replication).writes; builder.append(byEndpoint ? placement.toStringByEndpoint() : placement.toString()); builder.append("']}"); keyspaces.add(builder.toString()); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java index a24b7a32b362..dc610ba4e732 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java @@ -1047,7 +1047,7 @@ public static PrepareMove prepareMove(NodeId id, Token newToken) public static VersionedEndpoints.ForToken getNaturalReplicasForToken(ClusterMetadata metadata, String keyspace, Token searchPosition) { KeyspaceMetadata keyspaceMetadata = metadata.schema.getKeyspaces().getNullable(keyspace); - return metadata.placements.get(keyspaceMetadata.params.replication).reads.forToken(searchPosition); + return metadata.placement(keyspaceMetadata.params.replication).reads.forToken(searchPosition); } public static BootstrapAndJoin getBootstrapPlan(int idx) diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java index 82e864b3ef2d..c3efb51c38ea 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java @@ -506,7 +506,7 @@ public static void simulateBounces(ReplicationFactor rf, CMSPlacementStrategy CM Set bouncing = new HashSet<>(); Set replicasFromBouncedReplicaSets = new HashSet<>(); outer: - for (VersionedEndpoints.ForRange placements : sut.service.metadata().placements.get(rf.asKeyspaceParams().replication).writes.endpoints) + for (VersionedEndpoints.ForRange placements : sut.service.metadata().placement(rf.asKeyspaceParams().replication).writes.endpoints) { List replicas = new ArrayList<>(metadata.directory.toNodeIds(placements.get().endpoints())); List bounceCandidates = new ArrayList<>(); @@ -543,8 +543,8 @@ public static void validatePlacementsFinal(CMSTestBase.CMSSut sut, ModelState mo ClusterMetadata actualMetadata = sut.service.metadata(); ReplicationParams replication = actualMetadata.schema.getKeyspaces().get("test").get().params.replication; Assert.assertEquals(replication, sut.rf.asKeyspaceParams().replication); - match(actualMetadata.placements.get(replication).reads, sut.rf.replicate(modelState.simulatedPlacements.nodes).asMap()); - match(actualMetadata.placements.get(replication).writes, sut.rf.replicate(modelState.simulatedPlacements.nodes).asMap()); + match(actualMetadata.placement(replication).reads, sut.rf.replicate(modelState.simulatedPlacements.nodes).asMap()); + match(actualMetadata.placement(replication).writes, sut.rf.replicate(modelState.simulatedPlacements.nodes).asMap()); } public static void validatePlacements(CMSTestBase.CMSSut sut, ModelState modelState) throws Throwable @@ -557,7 +557,7 @@ public static void validatePlacements(CMSTestBase.CMSSut sut, ModelState modelSt Assert.assertEquals(modelState.simulatedPlacements.nodes.stream().map(Node::token).collect(Collectors.toSet()), actualMetadata.tokenMap.tokens().stream().map(t -> ((LongToken) t).getLongValue()).collect(Collectors.toSet())); - for (Map.Entry e : actualMetadata.placements.asMap().entrySet()) + for (Map.Entry e : actualMetadata.placements().asMap().entrySet()) { if (!e.getKey().equals(replication)) continue; @@ -567,7 +567,7 @@ public static void validatePlacements(CMSTestBase.CMSSut sut, ModelState modelSt match(placement.reads, modelState.simulatedPlacements.readPlacements); } - validatePlacements(sut.partitioner, sut.rf, modelState, actualMetadata.placements); + validatePlacements(sut.partitioner, sut.rf, modelState, actualMetadata.placements()); } public static ModelChecker.Pair registerNewNode(ModelState state, CMSSut sut, int dcIdx, int rackIdx) @@ -938,7 +938,7 @@ public void testPlacementsAllSettled() throws Throwable Assert.assertTrue(allSettled.equivalentTo(sut.service.metadata().writePlacementAllSettled(ksm))); validatePlacements(sut, state); } - Assert.assertTrue(allSettled.equivalentTo(sut.service.metadata().placements.get(ksm.params.replication))); + Assert.assertTrue(allSettled.equivalentTo(sut.service.metadata().placement(ksm.params.replication))); } } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/OperationalEquivalenceTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/OperationalEquivalenceTest.java index 2f3e9f81ef9a..af4354a1331f 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/OperationalEquivalenceTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/OperationalEquivalenceTest.java @@ -116,8 +116,7 @@ public void testMove(ReplicationFactor rf) throws Exception withMove = ClusterMetadata.current(); } - assertPlacements(simulateAndCompare(rf, equivalentNodes).placements, - withMove.placements); + assertPlacements(simulateAndCompare(rf, equivalentNodes).placements(), withMove.placements()); } private static ClusterMetadata simulateAndCompare(ReplicationFactor rf, List nodes) throws Exception diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java index ff1b7d50ac2a..64663280aa49 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java @@ -85,7 +85,7 @@ public void expandAndShrinkCMSTest() throws Throwable ClusterMetadata metadata = ClusterMetadata.current(); assertEquals(5, metadata.fullCMSMembers().size()); assertEquals(ReplicationParams.simpleMeta(5, metadata.directory.knownDatacenters()), - metadata.placements.keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); + metadata.placements().keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); }); cluster.stream().forEach(i -> { Assert.assertTrue(i.executeInternal(String.format("SELECT * FROM %s.%s", SchemaConstants.METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME)).length > 0); @@ -96,7 +96,7 @@ public void expandAndShrinkCMSTest() throws Throwable ClusterMetadata metadata = ClusterMetadata.current(); assertEquals(1, metadata.fullCMSMembers().size()); assertEquals(ReplicationParams.simpleMeta(1, metadata.directory.knownDatacenters()), - metadata.placements.keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); + metadata.placements().keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); }); } } @@ -136,7 +136,7 @@ public void cancelCMSReconfigurationTest() throws Throwable Assert.assertNull(metadata.inProgressSequences.get(ReconfigureCMS.SequenceKey.instance)); assertEquals(2, metadata.fullCMSMembers().size()); ReplicationParams params = ReplicationParams.meta(metadata); - DataPlacement placements = metadata.placements.get(params); + DataPlacement placements = metadata.placements().get(params); assertTrue(placements.reads.equivalentTo(placements.writes)); assertEquals(metadata.fullCMSMembers().size(), Integer.parseInt(params.asMap().get("dc0"))); }); @@ -161,7 +161,7 @@ public void cancelCMSReconfigurationTest() throws Throwable Assert.assertNull(metadata.inProgressSequences.get(ReconfigureCMS.SequenceKey.instance)); Assert.assertTrue(metadata.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())); assertEquals(3, metadata.fullCMSMembers().size()); - DataPlacement placements = metadata.placements.get(ReplicationParams.meta(metadata)); + DataPlacement placements = metadata.placements().get(ReplicationParams.meta(metadata)); Assert.assertTrue(placements.reads.equivalentTo(placements.writes)); }); } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ResumableStartupTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ResumableStartupTest.java index 07ec054d54ae..51ccb4d7ea6b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ResumableStartupTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ResumableStartupTest.java @@ -146,12 +146,12 @@ public void bootstrapWithDeferredJoinTest() throws Throwable KeyspaceMetadata ksm = metadata.schema.getKeyspaceMetadata(keyspace); boolean isWriteReplica = false; boolean isReadReplica = false; - for (InetAddressAndPort readReplica : metadata.placements.get(ksm.params.replication).reads.byEndpoint().keySet()) + for (InetAddressAndPort readReplica : metadata.placement(ksm.params.replication).reads.byEndpoint().keySet()) { if (readReplica.getHostAddressAndPort().equals(newAddress)) isReadReplica = true; } - for (InetAddressAndPort writeReplica : metadata.placements.get(ksm.params.replication).writes.byEndpoint().keySet()) + for (InetAddressAndPort writeReplica : metadata.placement(ksm.params.replication).writes.byEndpoint().keySet()) { if (writeReplica.getHostAddressAndPort().equals(newAddress)) isWriteReplica = true; diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java b/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java index 5275471476b1..d4131f060d9a 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java @@ -177,8 +177,8 @@ public void advance(SimulatedPlacements simulatedState, ModelState.Transformer t sutActions.next(); ClusterMetadata m2 = ClusterMetadata.current(); - Map, VersionedEndpoints.ForRange> after = m2.placements.get(simulatedState.rf.asKeyspaceParams().replication).reads.asMap(); - m1.placements.get(simulatedState.rf.asKeyspaceParams().replication).reads.forEach((k, beforePlacements) -> { + Map, VersionedEndpoints.ForRange> after = m2.placement(simulatedState.rf.asKeyspaceParams().replication).reads.asMap(); + m1.placement(simulatedState.rf.asKeyspaceParams().replication).reads.forEach((k, beforePlacements) -> { if (after.containsKey(k)) { VersionedEndpoints.ForRange afterPlacements = after.get(k); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java index e355fdccc800..62416825bb2c 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java @@ -69,7 +69,7 @@ public void testSimpleSnapshot() throws Throwable ClusterMetadata before = ClusterMetadata.current(); ClusterMetadata after = ClusterMetadataService.instance().triggerSnapshot(); ClusterMetadata serialized = ClusterMetadataService.instance().snapshotManager().getSnapshot(after.epoch); - assertEquals(before.placements, serialized.placements); + assertEquals(before.placements(), serialized.placements()); assertEquals(before.tokenMap, serialized.tokenMap); assertEquals(before.directory, serialized.directory); assertEquals(before.schema, serialized.schema); diff --git a/test/distributed/org/apache/cassandra/distributed/test/ring/RangeVersioningTest.java b/test/distributed/org/apache/cassandra/distributed/test/ring/RangeVersioningTest.java index 35c81a53bb3e..1b7eda5aced4 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ring/RangeVersioningTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ring/RangeVersioningTest.java @@ -54,7 +54,7 @@ public void existingPlacementsUnchangedAfterAlter() throws IOException for (int i = 1; i <= 4; i++) { Epoch smallestSeen = null; - for (VersionedEndpoints.ForRange fr : metadata.placements.get(ReplicationParams.simple(i)).writes.endpoints) + for (VersionedEndpoints.ForRange fr : metadata.placement(ReplicationParams.simple(i)).writes.endpoints) { if (smallestSeen == null || fr.lastModified().isBefore(smallestSeen)) smallestSeen = fr.lastModified(); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeAssassinateTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeAssassinateTest.java index c020fdcc66c1..d5875e70fe79 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeAssassinateTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeAssassinateTest.java @@ -60,7 +60,7 @@ static void checkPlacements(IUpgradeableInstance i, String host, boolean shouldE ((IInvokableInstance) i).runOnInstance(() -> { ClusterMetadata metadata = ClusterMetadata.current(); InetAddressAndPort ep = InetAddressAndPort.getByNameUnchecked(host); - metadata.placements.asMap().forEach((key, value) -> { + metadata.placements().forEach((key, value) -> { if (key.isMeta()) return; boolean existsInPlacements = Streams.concat(value.reads.endpoints.stream(), diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java b/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java index 9fa798bb6397..b1d19a789108 100644 --- a/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java +++ b/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java @@ -940,7 +940,7 @@ public static int[] cmsGroup(IInvokableInstance inst) { return inst.callOnInstance(() -> { ClusterMetadata current = ClusterMetadata.current(); - Set members = current.placements.get(ReplicationParams.meta(current)).writes.byEndpoint().keySet(); + Set members = current.placement(ReplicationParams.meta(current)).writes.byEndpoint().keySet(); // Why not just use 'current.fullCMSMembers()'? That uses the "read" replicas, so "could" have less endpoints // It would be more consistent to use fullCMSMembers but thought process is knowing the full set is better // than the coordination set. diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterReplace.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterReplace.java index 140da9f18e9b..9718f86ab89a 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterReplace.java +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterReplace.java @@ -78,7 +78,7 @@ public ActionList performSimple() List> repairRanges = actions.cluster.get(leaving).unsafeApplyOnThisThread( (String keyspaceName) -> { ClusterMetadata metadata = ClusterMetadata.current(); - return metadata.placements.get(metadata.schema.getKeyspace(keyspaceName).getMetadata().params.replication) + return metadata.placement(metadata.schema.getKeyspace(keyspaceName).getMetadata().params.replication) .writes.ranges() .stream() .map(OnClusterReplace::toStringEntry) @@ -93,7 +93,7 @@ public ActionList performSimple() (String keyspaceName, String tk) -> { ClusterMetadata metadata = ClusterMetadata.current(); KeyspaceMetadata keyspaceMetadata = metadata.schema.getKeyspaces().getNullable(keyspaceName); - return metadata.placements.get(keyspaceMetadata.params.replication).reads + return metadata.placement(keyspaceMetadata.params.replication).reads .forToken(Utils.parseToken(tk)) .get() .stream().map(Replica::endpoint) diff --git a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java index d2a59bdc71a4..25854eca3e95 100644 --- a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java +++ b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java @@ -372,6 +372,6 @@ public static EndpointsForToken getWriteEndpoints(ClusterMetadata metadata, ReplicationParams replicationParams, Token token) { - return metadata.placements.get(replicationParams).writes.forToken(token).get(); + return metadata.placement(replicationParams).writes.forToken(token).get(); } } diff --git a/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java b/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java index 8b6b9ddb3c97..22a62ac84cd0 100644 --- a/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java +++ b/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java @@ -105,7 +105,7 @@ public static ClusterMetadata configureCluster(List> ranges, Keyspa { ReplicationParams replication = keyspace.params.replication; AbstractReplicationStrategy strategy = AbstractReplicationStrategy.createReplicationStrategy(keyspace.name, replication); - DataPlacements.Builder placements = metadata.placements.unbuild(); + DataPlacements.Builder placements = metadata.placements().unbuild(); DataPlacement placement = strategy.calculateDataPlacement(Epoch.EMPTY, metadata.tokenMap.toRanges(), metadata); placements.with(replication, placement); metadata = transformer.with(placements.build()).build().metadata; diff --git a/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java b/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java index c2070a76f507..011973415930 100644 --- a/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java +++ b/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java @@ -338,7 +338,7 @@ private static boolean left(ClusterMetadata metadata, Node.Id id) private static boolean joined(ClusterMetadata metadata, Node.Id id) { NodeAddresses address = metadata.directory.getNodeAddresses(new NodeId(id.id)); - return metadata.placements.get(replication_params).reads.byEndpoint().keySet().contains(address.broadcastAddress); + return metadata.placement(replication_params).reads.byEndpoint().keySet().contains(address.broadcastAddress); } public enum EpochTracker { topologyManager, accordSyncPropagator } @@ -609,7 +609,7 @@ private void notify(ClusterMetadata current) { Topology t = AccordTopology.createAccordTopology(current); Ranges ranges = t.ranges().mergeTouching(); - if (!current.placements.get(replication_params).reads.isEmpty()) + if (!current.placement(replication_params).reads.isEmpty()) Assertions.assertThat(ranges).hasSize(1); cms.setMetadata(current); for (Node.Id id : status(s -> s != Status.Removed)) @@ -715,7 +715,7 @@ void maybeTransition(ClusterMetadata current, Topology t) case Registered: Invariants.require(!t.nodes().contains(id), "Node was in Init state but present in the Topology!"); Invariants.require(current.directory.peerId(address(id)) != null, "Node exists but not in TCM"); - if (current.placements.get(replication_params).writes.byEndpoint().keySet().contains(address(id))) + if (current.placement(replication_params).writes.byEndpoint().keySet().contains(address(id))) status = Status.Joining; break; case Joining: diff --git a/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java b/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java index 6b0d1194482f..476a45459713 100644 --- a/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java +++ b/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java @@ -156,7 +156,7 @@ private Epoch doTest(Epoch epoch, ClusterMetadata first) throws Throwable assertEquals(toWrite.schema, fromRead.schema); assertEquals(toWrite.directory, fromRead.directory); assertEquals(toWrite.tokenMap, fromRead.tokenMap); - assertEquals(toWrite.placements, fromRead.placements); + assertEquals(toWrite.placements(), fromRead.placements()); assertEquals(toWrite.lockedRanges, fromRead.lockedRanges); assertEquals(toWrite.inProgressSequences, fromRead.inProgressSequences); assertEquals(toWrite.extensions, fromRead.extensions); diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTest.java index ac3a278cf238..8c307b519ee6 100644 --- a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTest.java +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTest.java @@ -85,7 +85,7 @@ public void testWritePlacementAllSettledLeaving() ClusterMetadataService.instance().commit(plan.midLeave); ClusterMetadataService.instance().commit(plan.finishLeave); - DataPlacement actualFinishedWritePlacements = ClusterMetadata.current().placements.get(ksm.params.replication); + DataPlacement actualFinishedWritePlacements = ClusterMetadata.current().placement(ksm.params.replication); assertTrue(actualFinishedWritePlacements.difference(writeAllSettled).writes.removals.isEmpty()); assertTrue(actualFinishedWritePlacements.difference(writeAllSettled).writes.additions.isEmpty()); @@ -111,7 +111,7 @@ public void testWritePlacementAllSettledJoining() ClusterMetadataService.instance().commit(plan.midJoin); ClusterMetadataService.instance().commit(plan.finishJoin); - DataPlacement actualFinishedWritePlacements = ClusterMetadata.current().placements.get(ksm.params.replication); + DataPlacement actualFinishedWritePlacements = ClusterMetadata.current().placement(ksm.params.replication); assertTrue(actualFinishedWritePlacements.difference(writeAllSettled).writes.removals.isEmpty()); assertTrue(actualFinishedWritePlacements.difference(writeAllSettled).writes.additions.isEmpty()); } diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java index 730ee6cb6d41..235a9735e801 100644 --- a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java @@ -348,7 +348,7 @@ else if (key == NODE_DIRECTORY) else if (key == TOKEN_MAP) return metadata.tokenMap; else if (key == DATA_PLACEMENTS) - return metadata.placements; + return metadata.placements(); else if (key == LOCKED_RANGES) return metadata.lockedRanges; else if (key == IN_PROGRESS_SEQUENCES) diff --git a/test/unit/org/apache/cassandra/tcm/UnregisterTest.java b/test/unit/org/apache/cassandra/tcm/UnregisterTest.java index 62a6b160a288..3861eefd6bd7 100644 --- a/test/unit/org/apache/cassandra/tcm/UnregisterTest.java +++ b/test/unit/org/apache/cassandra/tcm/UnregisterTest.java @@ -110,7 +110,7 @@ private static void assertNoTrace(InetAddressAndPort ep, NodeId nodeId, Token t, assertFalse(metadata.directory.allJoinedEndpoints().contains(ep)); assertFalse(metadata.directory.allDatacenterRacks().containsKey("dc2")); assertFalse(metadata.directory.knownDatacenters().contains("dc2")); - metadata.placements.asMap().forEach((params, placement) -> { + metadata.placements().forEach((params, placement) -> { assertFalse(Streams.concat(placement.writes.endpoints.stream(), placement.reads.endpoints.stream()).anyMatch((fr) -> fr.endpoints().contains(ep))); }); } diff --git a/test/unit/org/apache/cassandra/tcm/compatibility/GossipHelperTest.java b/test/unit/org/apache/cassandra/tcm/compatibility/GossipHelperTest.java index cac53d553fb4..17d515e1ff5a 100644 --- a/test/unit/org/apache/cassandra/tcm/compatibility/GossipHelperTest.java +++ b/test/unit/org/apache/cassandra/tcm/compatibility/GossipHelperTest.java @@ -104,7 +104,7 @@ public void singleInstanceFromGossipTest() throws UnknownHostException assertEquals(internal, metadata.directory.addresses.get(nodeId).localAddress); assertEquals(nativeAddress, metadata.directory.addresses.get(nodeId).nativeAddress); - DataPlacements dp = metadata.placements; + DataPlacements dp = metadata.placements(); assertEquals(1, dp.get(KSM.params.replication).reads.forToken(token).get().size()); assertTrue(dp.get(KSM.params.replication).reads.forToken(token).get().contains(endpoint)); assertEquals(1, dp.get(KSM.params.replication).writes.forToken(token).get().size()); @@ -196,8 +196,8 @@ private static void verifyPlacements(Map endpoints, ClusterMetad assertEquals(entry.getValue(), metadata.tokenMap.tokens(nodeId).iterator().next()); } - ReplicaGroups reads = metadata.placements.get(KSM_NTS.params.replication).reads; - ReplicaGroups writes = metadata.placements.get(KSM_NTS.params.replication).writes; + ReplicaGroups reads = metadata.placement(KSM_NTS.params.replication).reads; + ReplicaGroups writes = metadata.placement(KSM_NTS.params.replication).writes; assertEquals(reads, writes); // tokens are // dc1: 1: 1000, 3: 3000, 5: 5000, 6: 7000, 7: 9000 diff --git a/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java b/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java index 46adc7c67a1d..05bc00cadb99 100644 --- a/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java +++ b/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java @@ -109,7 +109,7 @@ public void triggerSnapshotTest() listener.notify(entry, result); ClusterMetadata snapshot = snapshots.getSnapshot(nextEpoch); assertEquals(nextEpoch, snapshot.epoch); - assertEquals(toSnapshot.placements, snapshot.placements); + assertEquals(toSnapshot.placements(), snapshot.placements()); } private MetadataSnapshots init() diff --git a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java index d4e1b4df299f..94eb01fc1b9e 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java @@ -301,7 +301,7 @@ private void testRevertingReplace(long seed) private void assertRelevantMetadata(ClusterMetadata first, ClusterMetadata second) { - assertTrue(first.placements.equivalentTo(second.placements)); + assertTrue(first.placements().equivalentTo(second.placements())); assertTrue(first.directory.equivalentTo(second.directory)); assertTrue(first.tokenMap.equivalentTo(second.tokenMap)); assertEquals(first.lockedRanges.locked.keySet(), second.lockedRanges.locked.keySet()); diff --git a/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java b/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java index ba431db848a7..64dfd850ed6b 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java @@ -170,7 +170,7 @@ public void respond(V response, Message message) {} case ALL: { Set replicas = metadata.lockedRanges.locked.get(LockedRanges.keyFor(metadata.epoch)) - .toPeers(rf.asKeyspaceParams().replication, metadata.placements, metadata.directory) + .toPeers(rf.asKeyspaceParams().replication, metadata.placements(), metadata.directory) .stream() .map(n -> metadata.directory.getNodeAddresses(n).broadcastAddress) .collect(Collectors.toSet()); @@ -188,7 +188,7 @@ public void respond(V response, Message message) {} case QUORUM: { Set replicas = metadata.lockedRanges.locked.get(LockedRanges.keyFor(metadata.epoch)) - .toPeers(rf.asKeyspaceParams().replication, metadata.placements, metadata.directory) + .toPeers(rf.asKeyspaceParams().replication, metadata.placements(), metadata.directory) .stream() .map(n -> metadata.directory.getNodeAddresses(n).broadcastAddress) .collect(Collectors.toSet()); @@ -206,7 +206,7 @@ public void respond(V response, Message message) {} case LOCAL_QUORUM: { List replicas = new ArrayList<>(metadata.lockedRanges.locked.get(LockedRanges.keyFor(metadata.epoch)) - .toPeers(rf.asKeyspaceParams().replication, metadata.placements, metadata.directory) + .toPeers(rf.asKeyspaceParams().replication, metadata.placements(), metadata.directory) .stream() .filter((n) -> metadata.directory.location(n).datacenter.equals(dc)) .map(n -> metadata.directory.getNodeAddresses(n).broadcastAddress) @@ -230,7 +230,7 @@ public void respond(V response, Message message) {} { Map byDc = new HashMap<>(); metadata.lockedRanges.locked.get(LockedRanges.keyFor(metadata.epoch)) - .toPeers(rf.asKeyspaceParams().replication, metadata.placements, metadata.directory) + .toPeers(rf.asKeyspaceParams().replication, metadata.placements(), metadata.directory) .forEach(n -> byDc.compute(metadata.directory.location(n).datacenter, (k, v) -> v == null ? 1 : v + 1)); @@ -259,7 +259,7 @@ public void respond(V response, Message message) {} } case ONE: Set replicas = metadata.lockedRanges.locked.get(LockedRanges.keyFor(metadata.epoch)) - .toPeers(rf.asKeyspaceParams().replication, metadata.placements, metadata.directory) + .toPeers(rf.asKeyspaceParams().replication, metadata.placements(), metadata.directory) .stream() .map(n -> metadata.directory.getNodeAddresses(n).broadcastAddress) .collect(Collectors.toSet()); diff --git a/test/unit/org/apache/cassandra/tcm/transformations/EventsMetadataTest.java b/test/unit/org/apache/cassandra/tcm/transformations/EventsMetadataTest.java index 11fd03c6969b..0f9115805fc9 100644 --- a/test/unit/org/apache/cassandra/tcm/transformations/EventsMetadataTest.java +++ b/test/unit/org/apache/cassandra/tcm/transformations/EventsMetadataTest.java @@ -95,8 +95,8 @@ public void firstRegisterTest() // should not be in tokenMap (no tokens yet) assertTrue(metadata.tokenMap.tokens(nodeId).isEmpty()); - assertTrue(metadata.placements.get(KSM.params.replication).writes.byEndpoint().isEmpty()); - assertTrue(metadata.placements.get(KSM.params.replication).reads.byEndpoint().isEmpty()); + assertTrue(metadata.placement(KSM.params.replication).writes.byEndpoint().isEmpty()); + assertTrue(metadata.placement(KSM.params.replication).reads.byEndpoint().isEmpty()); assertTrue(metadata.lockedRanges.locked.isEmpty()); } @@ -130,9 +130,9 @@ public void joinTest() throws ExecutionException, InterruptedException assertTrue(ClusterMetadata.current().tokenMap.tokens(nodeId).isEmpty()); assertEquals(NodeState.BOOTSTRAPPING, ClusterMetadata.current().directory.peerState(nodeId)); - assertTrue(ClusterMetadata.current().placements.get(KSM.params.replication).writes.byEndpoint().containsKey(node1)); + assertTrue(ClusterMetadata.current().placement(KSM.params.replication).writes.byEndpoint().containsKey(node1)); // the first joined node gets added to the read endpoints immediately - assertTrue(ClusterMetadata.current().placements.get(KSM.params.replication).reads.byEndpoint().containsKey(node1)); + assertTrue(ClusterMetadata.current().placement(KSM.params.replication).reads.byEndpoint().containsKey(node1)); ClusterMetadataService.instance().commit(plan.midJoin); ClusterMetadataService.instance().commit(plan.finishJoin); @@ -152,8 +152,8 @@ public void joinTest() throws ExecutionException, InterruptedException assertTrue(ClusterMetadata.current().tokenMap.tokens(nodeId).isEmpty()); assertEquals(NodeState.BOOTSTRAPPING, ClusterMetadata.current().directory.peerState(nodeId)); - assertTrue(ClusterMetadata.current().placements.get(KSM.params.replication).writes.byEndpoint().containsKey(node2)); - assertFalse(ClusterMetadata.current().placements.get(KSM.params.replication).reads.byEndpoint().containsKey(node2)); + assertTrue(ClusterMetadata.current().placement(KSM.params.replication).writes.byEndpoint().containsKey(node2)); + assertFalse(ClusterMetadata.current().placement(KSM.params.replication).reads.byEndpoint().containsKey(node2)); } @Test @@ -178,7 +178,7 @@ public void leaveTest() throws ExecutionException, InterruptedException // no change in metadata after prepareLeave; assertEquals(before.directory, after.directory); assertEquals(before.tokenMap, after.tokenMap); - assertEquals(before.placements, after.placements); + assertEquals(before.placements(), after.placements()); assertEquals(before.schema, after.schema); ClusterMetadataService.instance().commit(leave.startLeave); From aaa2049d94b5c8f929e115406b1ed9fa0f2352c7 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 16 Jun 2025 19:18:56 +0100 Subject: [PATCH 13/23] [CASSANDRA-20736] Test fixes --- .../apache/cassandra/tcm/ClusterMetadata.java | 13 +++++++------ .../cassandra/tcm/PaxosBackedProcessor.java | 5 +---- .../test/log/BootWithMetadataTest.java | 16 ++++------------ .../test/log/ClusterMetadataDumpTest.java | 3 ++- .../distributed/test/log/ReconfigureCMSTest.java | 12 ++++++------ .../cassandra/auth/GrantAndRevokeTest.java | 8 ++++++++ .../cassandra/tcm/BootWithMetadataTest.java | 6 +++++- .../apache/cassandra/tcm/GetLogStateTest.java | 14 ++++++++++++++ .../tcm/log/DistributedLogStateTest.java | 5 +++-- .../InProgressSequenceCancellationTest.java | 8 ++++++-- .../tcm/transformations/PrepareLeaveTest.java | 15 ++++++++------- 11 files changed, 64 insertions(+), 41 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index c70001c8e272..92e19efc5206 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -228,7 +228,13 @@ public Set fullCMSMemberIds() public boolean isCMSMember(InetAddressAndPort endpoint) { - return fullCMSMembers().contains(endpoint); + if (epoch.isAfter(Epoch.FIRST)) + return fullCMSMembers().contains(endpoint); + + // special case to handle initialization of the CMS for the first time + return epoch.isEqualOrBefore(Epoch.FIRST) && cmsDataPlacement.reads.byEndpoint() + .keySet() + .contains(endpoint); } public Set fullCMSMembers() @@ -255,11 +261,6 @@ public EndpointsForRange fullCMSMembersAsReplicas() return fullCMSReplicas; } - public DataPlacement getCMSPlacement() - { - return cmsDataPlacement; - } - private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMembership cms) { if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) diff --git a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java index 6aeb12aa8817..a133309ec95e 100644 --- a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java @@ -66,10 +66,7 @@ public PaxosBackedProcessor(LocalLog log) @Override protected boolean acceptCommit(ClusterMetadata metadata) { - if (metadata.epoch.isAfter(Epoch.FIRST) && metadata.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())) - return true; - return metadata.epoch.isEqualOrBefore(Epoch.FIRST) - && metadata.getCMSPlacement().reads.byEndpoint().keySet().contains(FBUtilities.getBroadcastAddressAndPort()); + return metadata.isCMSMember(FBUtilities.getBroadcastAddressAndPort()); } @Override diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java index bcc02b89a62d..a76d0d8d424b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java @@ -32,14 +32,11 @@ import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.io.util.FileOutputStreamPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.CMSOperations; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeVersion; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.serialization.VerboseMetadataSerializer; import static org.apache.cassandra.distributed.shared.ClusterUtils.start; @@ -108,15 +105,10 @@ public void newCMSTest() throws IOException, ExecutionException, InterruptedExce try { ClusterMetadata metadata = ClusterMetadata.current(); - Replica oldCMS = MetaStrategy.replica(InetAddressAndPort.getByNameUnchecked("127.0.0.1")); - Replica newCMS = MetaStrategy.replica(InetAddressAndPort.getByNameUnchecked("127.0.0.2")); + NodeId oldCMS = metadata.directory.peerId(InetAddressAndPort.getByNameUnchecked("127.0.0.1")); + NodeId newCMS = metadata.directory.peerId(InetAddressAndPort.getByNameUnchecked("127.0.0.2")); ClusterMetadata.Transformer transformer = metadata.transformer(); - DataPlacement.Builder builder = metadata.placements.get(ReplicationParams.meta(metadata)).unbuild() - .withoutReadReplica(metadata.nextEpoch(), oldCMS) - .withoutWriteReplica(metadata.nextEpoch(), oldCMS) - .withWriteReplica(metadata.nextEpoch(), newCMS) - .withReadReplica(metadata.nextEpoch(), newCMS); - transformer = transformer.with(metadata.placements.unbuild().with(ReplicationParams.meta(metadata), builder.build()).build()); + transformer.leaveCMS(oldCMS).startJoiningCMS(newCMS).finishJoiningCMS(newCMS); ClusterMetadata toDump = transformer.build().metadata.forceEpoch(Epoch.create(1000)); Path p = Files.createTempFile("clustermetadata", "dump"); try (FileOutputStreamPlus out = new FileOutputStreamPlus(p)) diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java index b0f66ff885a8..1b1c6f482511 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java @@ -63,7 +63,8 @@ else if (l.contains("UNSAFE_JOIN")) epochsSeen++; } assertEquals(3, unsafeJoinSeen); - assertEquals(3, registerSeen); + // Only 2 REGISTER transforms are expected as the first CMS node is registered implicitly by INITIALIZE_CMS + assertEquals(2, registerSeen); assertTrue(epochsSeen > 15); res = cluster.get(1).nodetoolResult("cms", "dumplog", "--start", "10", "--end", "15"); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java index 64663280aa49..e2951a2f8478 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java @@ -41,7 +41,6 @@ import org.apache.cassandra.locator.MetaStrategy; import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; import org.apache.cassandra.schema.ReplicationParams; -import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.service.paxos.PaxosRepairHistory; import org.apache.cassandra.tcm.ClusterMetadata; @@ -57,6 +56,7 @@ import static org.apache.cassandra.distributed.shared.ClusterUtils.replaceHostAndStart; import static org.apache.cassandra.distributed.shared.NetworkTopology.dcAndRack; import static org.apache.cassandra.distributed.shared.NetworkTopology.networkTopology; +import static org.apache.cassandra.schema.SchemaConstants.METADATA_KEYSPACE_NAME; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.psjava.util.AssertStatus.assertTrue; @@ -85,10 +85,10 @@ public void expandAndShrinkCMSTest() throws Throwable ClusterMetadata metadata = ClusterMetadata.current(); assertEquals(5, metadata.fullCMSMembers().size()); assertEquals(ReplicationParams.simpleMeta(5, metadata.directory.knownDatacenters()), - metadata.placements().keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); + metadata.schema.getKeyspaceMetadata(METADATA_KEYSPACE_NAME).params.replication); }); cluster.stream().forEach(i -> { - Assert.assertTrue(i.executeInternal(String.format("SELECT * FROM %s.%s", SchemaConstants.METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME)).length > 0); + Assert.assertTrue(i.executeInternal(String.format("SELECT * FROM %s.%s", METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME)).length > 0); }); cluster.get(nodeSelector.get()).nodetoolResult("cms", "reconfigure", "1").asserts().success(); @@ -96,7 +96,7 @@ public void expandAndShrinkCMSTest() throws Throwable ClusterMetadata metadata = ClusterMetadata.current(); assertEquals(1, metadata.fullCMSMembers().size()); assertEquals(ReplicationParams.simpleMeta(1, metadata.directory.knownDatacenters()), - metadata.placements().keys().stream().filter(ReplicationParams::isMeta).findFirst().get()); + metadata.schema.getKeyspaceMetadata(METADATA_KEYSPACE_NAME).params.replication); }); } } @@ -326,11 +326,11 @@ private PaxosRepairHistory paxosRepairHistory(IInvokableInstance instance) Object[][] rows = instance.executeInternal("select points from system.paxos_repair_history " + "where keyspace_name = ? " + "and table_name = ?", - SchemaConstants.METADATA_KEYSPACE_NAME, + METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME); if (rows.length == 0) - return PaxosRepairHistory.empty(SchemaConstants.METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME); + return PaxosRepairHistory.empty(METADATA_KEYSPACE_NAME, DistributedMetadataLogKeyspace.TABLE_NAME); assertEquals(1, rows.length); //noinspection unchecked List points = (List)rows[0][0]; diff --git a/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java b/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java index 2c81d6163580..364cae7cb2ec 100644 --- a/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java +++ b/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java @@ -18,6 +18,7 @@ package org.apache.cassandra.auth; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -34,6 +35,8 @@ import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; +import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableMetadata; @@ -59,6 +62,11 @@ public static void setUpAuth() ServerTestUtils.daemonInitialization(); DatabaseDescriptor.setPermissionsValidity(0); DatabaseDescriptor.setRolesValidity(0); + // for the tables in the distributed metadata keyspace to be queryable, there needs to be a valid placement + // which is derived from the CMS membership. Most unit tests don't actually need to query those dist tables + // which is why this isn't done as a matter of routine + ClusterMetadataTestHelper.reconfigureCms(ReplicationParams.ntsMeta(Collections.singletonMap(DatabaseDescriptor.getLocalDataCenter(), 1))); + ServerTestUtils.markCMS(); requireAuthentication(); requireNetwork(); CassandraDaemon.getInstanceForTesting().setupVirtualKeyspaces(); diff --git a/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java b/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java index 476a45459713..fe9c6f026464 100644 --- a/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java +++ b/test/unit/org/apache/cassandra/tcm/BootWithMetadataTest.java @@ -41,6 +41,7 @@ import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.MembershipUtils; +import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeVersion; import org.apache.cassandra.tcm.ownership.DataPlacements; @@ -110,7 +111,10 @@ private Epoch doTest(Epoch epoch, ClusterMetadata first) throws Throwable Directory directory = first.directory; int nodeCount = 10; int tokensPerNode = 5; - for (int i = 0; i < nodeCount; i++) + // Ensure that the "local" node is registered as it being member of the CMS is a precondition of booting from + // a ClusterMetadata. + directory = directory.with(NodeAddresses.current(), new Location("DC1", "RACK1")); + for (int i = 0; i < nodeCount-1; i++) directory = directory.with(nodeAddresses(random), new Location("DC1", "RACK1")); t = t.with(directory); diff --git a/test/unit/org/apache/cassandra/tcm/GetLogStateTest.java b/test/unit/org/apache/cassandra/tcm/GetLogStateTest.java index 167aa3992b63..965b3f4918c2 100644 --- a/test/unit/org/apache/cassandra/tcm/GetLogStateTest.java +++ b/test/unit/org/apache/cassandra/tcm/GetLogStateTest.java @@ -41,8 +41,12 @@ import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.tcm.log.LogStorage; import org.apache.cassandra.tcm.log.SystemKeyspaceStorage; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeVersion; import org.apache.cassandra.tcm.ownership.UniformRangePlacement; import org.apache.cassandra.tcm.transformations.CustomTransformation; +import org.apache.cassandra.tcm.transformations.ForceSnapshot; import org.apache.cassandra.utils.FBUtilities; import static org.junit.Assert.assertEquals; @@ -80,6 +84,16 @@ public static void setup() throws IOException ClusterMetadataService.setInstance(cms); log.readyUnchecked(); log.unsafeBootstrapForTesting(FBUtilities.getBroadcastAddressAndPort()); + // register first node & make it CMS as this affects how the meta strategy placements are calculated + ClusterMetadata withRegistered = ClusterMetadata.current() + .transformer() + .register(NodeAddresses.current(), + DatabaseDescriptor.getInitialLocationProvider().initialLocation(), + NodeVersion.CURRENT) + .build().metadata; + NodeId id = withRegistered.directory.peerId(NodeAddresses.current().broadcastAddress); + ClusterMetadata withCMS = withRegistered.transformer().startJoiningCMS(id).finishJoiningCMS(id).build().metadata; + cms.commit(new ForceSnapshot(withCMS)); } @Test diff --git a/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java b/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java index 37cee7237347..05045b693257 100644 --- a/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java +++ b/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java @@ -61,8 +61,9 @@ LogStateSUT getSystemUnderTest(MetadataSnapshots snapshots) { return new LogStateSUT() { - - // start test entries at FIRST + 1 as the pre-init transform is automatically inserted with Epoch.FIRST + // we start test entries at FIRST, but in a real log the PRE_INITIALIZE_CMS transform is automatically + // inserted with Epoch.FIRST, followed by INITIALIZE_CMS so the next entry to be committed would be at + // epoch 3 Epoch currentEpoch = Epoch.FIRST; Epoch nextEpoch; boolean applied; diff --git a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java index 94eb01fc1b9e..50946d7ac465 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java @@ -105,11 +105,13 @@ private void testRevertingBootstrap(long seed) LockedRanges locked = lockedRanges(placements, random); // state of metadata before starting the sequence + // note: don't allow epoch to be Epoch.FIRST as this is a + // special case for calculating meta strategy placements. ClusterMetadata before = metadata(directory).transformer() .with(placements) .withNodeState(nodeId, NodeState.REGISTERED) .with(locked) - .build().metadata; + .build().metadata.forceEpoch(epoch(random)); // Placements after PREPARE_JOIN DataPlacements afterPrepare = placements(ranges(random), replication, random); @@ -179,11 +181,13 @@ private void testRevertingLeave(long seed) // Ranges locked by other operations LockedRanges locked = lockedRanges(placements, random); // state of metadata before starting the sequence + // note: don't allow epoch to be Epoch.FIRST as this is a + // special case for calculating meta strategy placements. ClusterMetadata before = metadata(directory).transformer() .with(placements) .withNodeState(nodeId, NodeState.JOINED) .with(locked) - .build().metadata; + .build().metadata.forceEpoch(epoch(random)); // PREPARE_LEAVE does not modify placements, so first transformation is START_LEAVE diff --git a/test/unit/org/apache/cassandra/tcm/transformations/PrepareLeaveTest.java b/test/unit/org/apache/cassandra/tcm/transformations/PrepareLeaveTest.java index f5044a00fdc3..bbf58eda543c 100644 --- a/test/unit/org/apache/cassandra/tcm/transformations/PrepareLeaveTest.java +++ b/test/unit/org/apache/cassandra/tcm/transformations/PrepareLeaveTest.java @@ -84,10 +84,11 @@ public static void setUpClass() throws Exception public void testCheckRF_Simple() throws Throwable { Keyspaces kss = Keyspaces.of(DistributedMetadataLogKeyspace.initialMetadata(Sets.newHashSet(hostDc.values())), KSM); + // should be accepted (2 nodes in dc1 where we remove the host): ClusterMetadata metadata = prepMetadata(kss, 2, 2); assertTrue(executeLeave(metadata)); - // should be rejected: - metadata = prepMetadata(kss, 1, 2); + // should be rejected because only 1 node in dc2: + metadata = prepMetadata(kss, 2, 1); assertFalse(executeLeave(metadata)); } @@ -97,17 +98,17 @@ public void testCheckRF_NTS() throws Throwable Keyspaces kss = Keyspaces.of(DistributedMetadataLogKeyspace.initialMetadata(Sets.newHashSet(hostDc.values())), KSM_NTS); ClusterMetadata metadata = prepMetadata(kss, 4, 4); assertTrue(executeLeave(metadata)); - // should be accepted (4 nodes in dc1 where we remove the host): - metadata = prepMetadata(kss, 4, 2); + // should be accepted (4 nodes in dc2 where we remove the host): + metadata = prepMetadata(kss, 2, 4); assertTrue(executeLeave(metadata)); - // should be rejected - metadata = prepMetadata(kss, 3, 4); + // should be rejected because there are already only 3 replicas in dc2 + metadata = prepMetadata(kss, 4, 3); assertFalse(executeLeave(metadata)); } private boolean executeLeave(ClusterMetadata metadata) throws Throwable { - PrepareLeave prepareLeave = new PrepareLeave(metadata.directory.peerId(InetAddressAndPort.getByName("127.0.0.1")), + PrepareLeave prepareLeave = new PrepareLeave(metadata.directory.peerId(InetAddressAndPort.getByName("127.0.0.11")), false, dummyPlacementProvider, LeaveStreams.Kind.UNBOOTSTRAP); From 01b2350b8b12b56256e840aec996b7274ec9cf9f Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Fri, 23 Jan 2026 12:39:01 +0000 Subject: [PATCH 14/23] [CASSANDRA-20736] Rework CMS initialization --- .../apache/cassandra/tcm/ClusterMetadata.java | 36 ++++++++++++++++--- .../cassandra/tcm/ClusterMetadataService.java | 1 + .../org/apache/cassandra/tcm/Startup.java | 8 ++--- .../transformations/cms/PreInitialize.java | 31 ++++++++++++---- ...rMetadataUpgradeDelayedInitializeTest.java | 8 ++--- 5 files changed, 66 insertions(+), 18 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 92e19efc5206..7f6b268ce3a0 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -351,10 +351,38 @@ public ClusterMetadata forceEpoch(Epoch epoch) capLastModified(cmsMembership, epoch)); } - public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier, - NodeAddresses addresses, - NodeVersion version, - Location location) + /** + * To be used only during the execute of PRE_INITIALIZE_CMS, this sets the DataPlacement for the metadata keyspace + * so that the global log can be initialized and the subsequent entry containing the INITIALIZE_CMS transformation + * can be committed. + * @param initialCMSPlacement Expected to be a singleton placement identifying the local node as the sole replica + * for the metadata keyspace + * @return ClusterMetadata instance in the correct state to constitute the result of the PRE_INITIALIZE_CMS + * transformation + */ + public ClusterMetadata forcePreInitializedState(DataPlacement initialCMSPlacement) + { + assert epoch.isEqualOrBefore(Epoch.FIRST); + // double check that all metadata elements have had the last modified epoch set correctly + ClusterMetadata initial = forceEpoch(Epoch.FIRST); + initial.cmsDataPlacement = initialCMSPlacement; + return initial; + } + + /** + * Produce a ClusterMetadata suitable for use as the base state in the INITIALIZE_CMS transformation. This should + * only be used on the first CMS node when bootstrapping the CMS after upgrade or in a brand new cluster. + * @param clusterIdentifier Unique identifier for split brain detection & protection + * @param addresses The NodeAddresses of the first CMS node. + * @param version Version info for the first CMS node. + * @param location The rack & DC of the first CMS node. + * @return ClusterMetadata instance in the correct state to constitute the base state of the INITIALIZE_CMS + * transformation + */ + public ClusterMetadata forceInitializedState(int clusterIdentifier, + NodeAddresses addresses, + NodeVersion version, + Location location) { if (this.metadataIdentifier != EMPTY_METADATA_IDENTIFIER) throw new IllegalStateException(String.format("Can only initialize cluster identifier once, but it was already set to %d", this.metadataIdentifier)); diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index 9c35f6a2b7bf..9c127655fccb 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -166,6 +166,7 @@ public static State state(ClusterMetadata metadata) // the distributed metadata table. if (metadata.epoch.isEqualOrBefore(Epoch.FIRST) || ClusterMetadata.current().isCMSMember(FBUtilities.getBroadcastAddressAndPort())) return LOCAL; + return REMOTE; } diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index ebf24e47c3d2..ebcaa2abd1b3 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -150,12 +150,12 @@ public static void initializeAsFirstCMSNode() NodeAddresses addresses = NodeAddresses.current(); Location location = DatabaseDescriptor.getLocator().local(); ClusterMetadataService.instance().log().bootstrap(addresses.broadcastAddress, location.datacenter); - ClusterMetadata metadata = ClusterMetadata.current(); - assert ClusterMetadataService.state() == LOCAL : String.format("Can't initialize as node hasn't transitioned to CMS state. State: %s.\n%s", ClusterMetadataService.state(), metadata); - Initialize initialize = new Initialize(metadata.initializeClusterIdentifier(addresses.broadcastAddress.hashCode(), + ClusterMetadata metadata = ClusterMetadata.current().forceInitializedState(addresses.broadcastAddress.hashCode(), addresses, NodeVersion.CURRENT, - location)); + location); + assert ClusterMetadataService.state(metadata) == LOCAL : String.format("Can't initialize as node hasn't transitioned to CMS state. State: %s.\n%s", ClusterMetadataService.state(), metadata); + Initialize initialize = new Initialize(metadata); ClusterMetadata initialized = ClusterMetadataService.instance().commit(initialize); NodeId id = initialized.myNodeId(); SystemKeyspace.setLocalHostId(id.toUUID()); diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java b/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java index ef7907bde93c..6592d1e0bb47 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/PreInitialize.java @@ -20,16 +20,21 @@ import java.io.IOException; +import com.google.common.collect.ImmutableSet; + import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.locator.Replica; import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; @@ -72,7 +77,6 @@ public Result execute(ClusterMetadata metadata) assert metadata.epoch.isBefore(Epoch.FIRST); ClusterMetadata.Transformer transformer = metadata.transformer(); - if (addr != null) { // If addr != null, then this is being executed on the peer which is actually initializing the log @@ -85,7 +89,7 @@ public Result execute(ClusterMetadata metadata) // PRE_INITIALIZE_CMS @ Epoch.FIRST, must be followed in the log by INITIALIZE_CMS @ (Epoch.FIRST + 1). // The serialization of INITIALIZE_CMS includes the full ClusterMetadata at that point, which is // obviously minimal, but will necessarily include the distributed metadata keyspace definition with - // the replication settings bootstrapped by PRE_INITIALIZE. This full ClusterMetadata becomes the + // the replication parameters bootstrapped by PRE_INITIALIZE_CMS. This full ClusterMetadata becomes the // starting point upon which further log entries are applied. So this means that once INITIALIZE_CMS // has been committed to the log, the actual content of PRE_INITIALIZE_CMS is irrelevant, even on // the first CMS node if it happens to replay it from its local storage after a restart. @@ -95,13 +99,28 @@ public Result execute(ClusterMetadata metadata) Keyspaces updated = metadata.schema.getKeyspaces() .withAddedOrReplaced(DistributedMetadataLogKeyspace.initialMetadata(datacenter)); transformer.with(new DistributedSchema(updated)); + ClusterMetadata.Transformer.Transformed transformed = transformer.build(); + + // This is required to bootstrap the initialization process. Because committing to the metadata log uses the + // previous ClusterMetadata to identify CMS members who form the consensus group, the first CMS member must + // be routable at that point. After the INITIALIZE_CMS is committed, and on instances other than the first + // CMS member, this placement is derived from the CMS membership list of node ids. + DataPlacement.Builder initialPlacement = DataPlacement.builder(); + Replica replica = new Replica(addr, + MetaStrategy.partitioner.getMinimumToken(), + MetaStrategy.partitioner.getMinimumToken(), + true); + initialPlacement.reads.withReplica(Epoch.FIRST, replica); + initialPlacement.writes.withReplica(Epoch.FIRST, replica); + metadata = transformed.metadata.forcePreInitializedState(initialPlacement.build()); + } + else + { + metadata = metadata.forceEpoch(Epoch.FIRST); } - ClusterMetadata.Transformer.Transformed transformed = transformer.build(); - metadata = transformed.metadata.forceEpoch(Epoch.FIRST); assert metadata.epoch.is(Epoch.FIRST) : metadata.epoch; - - return new Success(metadata, LockedRanges.AffectedRanges.EMPTY, transformed.modifiedKeys); + return new Success(metadata, LockedRanges.AffectedRanges.EMPTY, ImmutableSet.of()); } public String toString() diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeDelayedInitializeTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeDelayedInitializeTest.java index e1fffc74b78b..b406ae914c12 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeDelayedInitializeTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/ClusterMetadataUpgradeDelayedInitializeTest.java @@ -181,7 +181,7 @@ public static void installUpgradeVersionBB(ClassLoader classLoader, Integer num) try { new ByteBuddy().rebase(ClusterMetadata.class) - .method(named("initializeClusterIdentifier")) + .method(named("forceInitializedState")) .intercept(MethodDelegation.to(ClusterMetadataUpgradeDelayedInitializeTest.BBInterceptor.class)) .make() .load(classLoader, ClassLoadingStrategy.Default.INJECTION); @@ -201,13 +201,13 @@ public static void installUpgradeVersionBB(ClassLoader classLoader, Integer num) public static class BBInterceptor { @SuppressWarnings("unused") - public static ClusterMetadata initializeClusterIdentifier(@SuperCall Callable zuper) + public static ClusterMetadata forceInitializedState(@SuperCall Callable zuper) { try { - logger.info("initializeClusterIdentifier waiting..."); + logger.info("forceInitializedState waiting..."); BBState.latch.await(60, TimeUnit.SECONDS); - logger.info("initializeClusterIdentifier continuing..."); + logger.info("forceInitializedState continuing..."); return zuper.call(); } catch (Throwable e) From 51e1c6cfa7863aaeac105bb69b96beacb782c354 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 29 May 2025 12:32:37 +0100 Subject: [PATCH 15/23] [CASSANDRA-20476] Add dtest for CMS rediscovery --- .../distributed/shared/ClusterUtils.java | 40 ++- .../test/log/DiscoverNewCMSTest.java | 227 ++++++++++++++++++ 2 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java diff --git a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java index 006379ae3bfc..1d54bad404d9 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java @@ -24,7 +24,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -78,6 +80,7 @@ import org.apache.cassandra.gms.VersionedValue; import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.SimpleSeedProvider; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.RequestCallback; @@ -107,6 +110,7 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.BROADCAST_INTERVAL_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.REPLACE_ADDRESS_FIRST_BOOT; import static org.apache.cassandra.config.CassandraRelevantProperties.RING_DELAY; +import static org.apache.cassandra.distributed.impl.TestEndpointCache.fromCassandraInetAddressAndPort; import static org.apache.cassandra.distributed.impl.TestEndpointCache.toCassandraInetAddressAndPort; import static org.assertj.core.api.Assertions.assertThat; @@ -834,6 +838,21 @@ public static Set getCMSMembers(IInvokableInstance inst) .collect(Collectors.toSet())); } + public static Set getCMSMemberIds(IInvokableInstance inst) + { + Set idsAsInts = inst.callOnInstance(() -> ClusterMetadata.current() + .fullCMSMemberIds() + .stream() + .map(NodeId::id) + .collect(Collectors.toSet())); + return idsAsInts.stream().map(NodeId::new).collect(Collectors.toSet()); + } + + public static Set getCMSMemberAddresses(IInvokableInstance inst) + { + return inst.callOnInstance(() -> new HashSet<>(ClusterMetadata.current().fullCMSMembers())); + } + public static boolean decommission(IInvokableInstance leaving) { return leaving.callOnInstance(() -> { @@ -871,6 +890,12 @@ public static int getNodeId(IInvokableInstance target, IInvokableInstance execut }); } + public static InetSocketAddress getEndpoint(IInvokableInstance target, NodeId id) + { + String idString = Integer.toString(id.id()); + return target.callOnInstance(() -> fromCassandraInetAddressAndPort(ClusterMetadata.current().directory.endpoint(NodeId.fromString(idString)))); + } + public static boolean cancelInProgressSequences(IInvokableInstance executor) { return cancelInProgressSequences(getNodeId(executor), executor); @@ -1443,6 +1468,15 @@ public static String getPartitionerName(IInstance instance) return (String) instance.config().get("partitioner"); } + + public static void updateSeed(IInstance instance, String...address) + { + IInstanceConfig conf = instance.config(); + conf.set("seed_provider", + new IInstanceConfig.ParameterizedClass(SimpleSeedProvider.class.getName(), + Collections.singletonMap("seeds", String.join(",", address)))); + } + /** * Changes the instance's address to the new address. This method should only be called while the instance is * down, else has undefined behavior. @@ -1478,12 +1512,14 @@ private static void updateAddress(IInstanceConfig conf, String address) // are a risk if (!conf.broadcastAddress().equals(previous)) { - conf.networkTopology().put(conf.broadcastAddress(), NetworkTopology.dcAndRack(conf.localDatacenter(), conf.localRack())); + NetworkTopology topology = conf.networkTopology(); + NetworkTopology.DcAndRack location = NetworkTopology.dcAndRack(topology.localDC(previous), topology.localRack(previous)); + topology.put(conf.broadcastAddress(), location); try { Field field = NetworkTopology.class.getDeclaredField("map"); field.setAccessible(true); - Map map = (Map) field.get(conf.networkTopology()); + Map map = (Map) field.get(topology); map.remove(previous); } catch (NoSuchFieldException | IllegalAccessException e) diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java new file mode 100644 index 000000000000..431e042bff00 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.log; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import com.google.common.base.Throwables; + +import org.awaitility.Awaitility; +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class DiscoverNewCMSTest extends TestBaseImpl +{ + + @Before + public void disableAccord() + { + CassandraRelevantProperties.DTEST_ACCORD_ENABLED.setBoolean(false); + } + + @Test + public void singleNodeCMSAddressChangeTest() throws IOException, ExecutionException, InterruptedException + { + try (Cluster cluster = builder().withNodes(3) + .withConfig(config -> config.with(NETWORK, GOSSIP)) + .createWithoutStarting()) + { + test(cluster, 1); + } + } + + @Test + public void multiNodeCMSOnlyClusterAddressChangeTest() throws IOException, ExecutionException, InterruptedException + { + + try (Cluster cluster = builder().withNodes(3) + .withConfig(config -> config.with(NETWORK, GOSSIP)) + .createWithoutStarting()) + { + test(cluster, 3); + } + } + + @Test + public void multiNodeCMSAllAddressesChangeTest() throws IOException, ExecutionException, InterruptedException + { + try (Cluster cluster = builder().withNodes(6) + .withConfig(config -> config.with(NETWORK, GOSSIP)) + .createWithoutStarting()) + { + test(cluster, 3); + } + } + + private void test(Cluster cluster, int cmsSize) throws IOException, ExecutionException, InterruptedException + { + ExecutorService executor = Executors.newFixedThreadPool(cluster.size()); + cluster.setUncaughtExceptionsFilter((node, t) -> { + Throwable rootCause = Throwables.getRootCause(t); + // Some fetchCMSLog operations might temporarily fail and be retried during address changes + return rootCause.getMessage() != null + && rootCause.getMessage().startsWith("Cannot achieve consistency level SERIAL"); + + }); + cluster.startup(); + init(cluster); + IInvokableInstance n1 = cluster.get(1); + if (cmsSize > 1) + n1.nodetoolResult("cms", "reconfigure", "" + cmsSize).asserts().success(); + ClusterUtils.waitForCMSToQuiesce(cluster, n1); + + // Set up the expectations for what address changes are going to happen + Map> addressMapping = new HashMap<>(cluster.size()); + for (IInvokableInstance inst : cluster) + { + InetSocketAddress starting = inst.config().broadcastAddress(); + InetSocketAddress expected = new InetSocketAddress(bumpAddress(starting.getAddress()), starting.getPort()); + NodeId id = ClusterUtils.getNodeId(inst); + addressMapping.put(starting, Pair.create(id, expected)); + } + + // Check the CMS membership at the start of the test & predict what it should be at the end + Set startingCMS = new HashSet<>(cmsSize); + Set expectedCMS = new HashSet<>(cmsSize); + for (InetSocketAddress s : ClusterUtils.getCMSMemberAddresses(n1)) + { + startingCMS.add(s); + expectedCMS.add(addressMapping.get(s).right); + } + Set cmsNodes = ClusterUtils.getCMSMemberIds(n1); + + // Shut down all nodes, modify each one's broadcast address and reconfigure seeds as these + // will be used to rediscover peers. Seed config does not need to be uniform across the cluster + // but there must be enough intersection to enable the CMS members to rediscover each other + for (int i = 1; i <= cluster.size(); i++) + { + IInvokableInstance inst = cluster.get(i); + inst.shutdown().get(); + InetAddress newBroadcastAddress = addressMapping.get(inst.config().broadcastAddress()).right.getAddress(); + byte[] bytes = newBroadcastAddress.getAddress(); + ClusterUtils.updateAddress(inst, addrString(bytes)); + String seed1 = addrString(bytes[0], bytes[1], bytes[2], (byte) i); + String seed2 = addrString(bytes[0], bytes[1], bytes[2], (byte) ((i < cluster.size()) ? i + 1 : 1)); + ClusterUtils.updateSeed(inst, seed1, seed2); + } + + // Start everything up and wait for state to cluster state to quiesce + List> startups = new ArrayList<>(cluster.size()); + for (IInvokableInstance inst : cluster) + { + Future f = executor.submit(() -> { + inst.startup(); + return true; + }); + startups.add(f); + } + FBUtilities.waitOnFutures(startups, 60, TimeUnit.SECONDS); + ClusterUtils.waitForCMSToQuiesce(cluster, n1); + + // wait until each node's STARTUP transformation has been enacted by all nodes + Awaitility.waitAtMost(30, TimeUnit.SECONDS).until(() -> allAddressChangesEnacted(cluster)); + Epoch afterAllAddressChanges = getEpochAfterAllAddressChanges(cluster); + ClusterUtils.waitForCMSToQuiesce(cluster, afterAllAddressChanges, true); + + // Assert that: + // * The membership of the CMS (i.e. which node ids) remains the same + // * The set of CMS addresses matches the prediction made at the start + // * Every node has successfully changed its address. The previous check + // is a logical consequence of this, but it doesn't hurt to verify both + for (IInvokableInstance inst : cluster) + { + assertEquals(cmsNodes, ClusterUtils.getCMSMemberIds(inst)); + Set finalCMS = ClusterUtils.getCMSMemberAddresses(inst); + assertEquals(startingCMS.size(), finalCMS.size()); + assertEquals(expectedCMS.size(), finalCMS.size()); + assertTrue(expectedCMS.containsAll(finalCMS)); + for (Pair peer : addressMapping.values()) + { + InetSocketAddress fromInst = ClusterUtils.getEndpoint(inst, peer.left); + assertEquals("Check failed on instance " + inst.config().num(), peer.right, fromInst); + } + } + } + + private boolean allAddressChangesEnacted(Cluster cluster) + { + return getEpochAfterAllAddressChanges(cluster).isAfter(Epoch.FIRST); + } + + private Epoch getEpochAfterAllAddressChanges(Cluster cluster) + { + int nodes = cluster.size(); + long epochAfterAllStartups = cluster.get(1).callOnInstance(() -> { + UntypedResultSet rs = QueryProcessor.executeInternal("SELECT epoch, kind from system_views.cluster_metadata_log where kind = 'STARTUP'"); + if (rs.isEmpty() || rs.size() < nodes) + return -1; + + long epoch = rs.stream().mapToLong(r -> r.getLong("epoch")).max().orElse(-1); + return epoch; + }).longValue(); + return Epoch.create(epochAfterAllStartups); + } + + private InetAddress bumpAddress(InetAddress address) throws UnknownHostException + { + // ipv4 addresses for this test + assert address.getAddress().length == 4; + byte[] bytes = address.getAddress(); + bytes[2]++; + return InetAddress.getByAddress(bytes); + } + + private String addrString(byte...b) throws UnknownHostException + { + assert b.length == 4; + return InetAddress.getByAddress(new byte[] { b[0], b[1], b[2], b[3] }).getHostAddress(); + } + +} From d3466615426156210590d55a8ecf374bcade0872 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 23 Jun 2025 16:12:29 +0100 Subject: [PATCH 16/23] [CASSANDRA-20476] Prep for CMSLookup --- src/java/org/apache/cassandra/tcm/CMSOperations.java | 2 +- src/java/org/apache/cassandra/tcm/ClusterMetadata.java | 6 ++++++ .../org/apache/cassandra/tcm/ClusterMetadataService.java | 2 +- src/java/org/apache/cassandra/tcm/Commit.java | 5 +---- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/CMSOperations.java b/src/java/org/apache/cassandra/tcm/CMSOperations.java index d4b496e1acb7..592aec4a3814 100644 --- a/src/java/org/apache/cassandra/tcm/CMSOperations.java +++ b/src/java/org/apache/cassandra/tcm/CMSOperations.java @@ -155,7 +155,7 @@ public Map describeCMS() String members = metadata.fullCMSMembers().stream().sorted().map(Object::toString).collect(Collectors.joining(",")); info.put(MEMBERS, members); info.put(NEEDS_RECONFIGURATION, Boolean.toString(needsReconfiguration(metadata))); - info.put(IS_MEMBER, Boolean.toString(cms.isCurrentMember(FBUtilities.getBroadcastAddressAndPort()))); + info.put(IS_MEMBER, Boolean.toString(metadata.isCMSMember(FBUtilities.getBroadcastAddressAndPort()))); info.put(SERVICE_STATE, ClusterMetadataService.state(metadata).toString()); info.put(IS_MIGRATING, Boolean.toString(cms.isMigrating())); info.put(EPOCH, Long.toString(metadata.epoch.getEpoch())); diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 7f6b268ce3a0..39f102486a4b 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -117,6 +117,12 @@ public class ClusterMetadata // This isn't serialized as part of ClusterMetadata it's really just a view over the Directory. public final Locator locator; + // This isn't serialized, it's a helper to apply transient (or not yet committed) changes to CMS member addresses. + // It is initialized with any address changes detected in Startup::initializeCMSLookup when the node boots. It is + // subsequently updated exclusively by CMSLookup.LogListener when enacting a transformation which modifies the + // addresses of CMS member(s), making those changes permanent/durable. + public volatile CMSLookup cmsLookup = CMSLookup.FIRST; + // These fields are lazy but only for the test purposes, since their computation requires initialization of the log ks private EndpointsForRange fullCMSReplicas; private Set fullCMSEndpoints; diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index 9c127655fccb..7d70b157095b 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -164,7 +164,7 @@ public static State state(ClusterMetadata metadata) // The node is a full member of the CMS if it has started participating in reads for distributed metadata table (which // implies it is a write replica as well). In other words, it's a fully joined member of the replica set responsible for // the distributed metadata table. - if (metadata.epoch.isEqualOrBefore(Epoch.FIRST) || ClusterMetadata.current().isCMSMember(FBUtilities.getBroadcastAddressAndPort())) + if (metadata.isCMSMember(FBUtilities.getBroadcastAddressAndPort())) return LOCAL; return REMOTE; diff --git a/src/java/org/apache/cassandra/tcm/Commit.java b/src/java/org/apache/cassandra/tcm/Commit.java index f002b3e66ede..734868caf5f4 100644 --- a/src/java/org/apache/cassandra/tcm/Commit.java +++ b/src/java/org/apache/cassandra/tcm/Commit.java @@ -431,10 +431,7 @@ public void send(Result result, InetAddressAndPort source) // Filter the log entries from the commit result for the purposes of replicating to members of the cluster // other than the original submitter. We only need to include the sublist of entries starting at the one // which was newly committed. We exclude entries before that one as the submitter may have been lagging and - // supplied a last known epoch arbitrarily in the past. We include entries after the first newly committed - // one as there may have been a new period automatically triggered and we'd like to push that out to all - // peers too. Of course, there may be other entries interspersed with these but it doesn't harm anything to - // include those too, it may simply be redundant. + // supplied a last known epoch arbitrarily in the past. LogState newlyCommitted = success.logState.retainFrom(success.epoch); assert !newlyCommitted.isEmpty() : String.format("Nothing to replicate after retaining epochs since %s from %s", success.epoch, success.logState); From bed6f5647dc9fc3ff6217a0d9e61ff42e9c2b162 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 23 Jun 2025 18:05:25 +0100 Subject: [PATCH 17/23] [CASSANDRA-20476] Introduce CMSLookup --- .../org/apache/cassandra/tcm/CMSLookup.java | 185 ++++++++++++++++++ .../apache/cassandra/tcm/CMSMembership.java | 7 +- .../apache/cassandra/tcm/ClusterMetadata.java | 66 ++++++- .../cassandra/tcm/ClusterMetadataService.java | 3 +- src/java/org/apache/cassandra/tcm/Commit.java | 8 +- .../cassandra/tcm/membership/Directory.java | 2 +- .../tcm/membership/EndpointLookup.java | 26 +++ 7 files changed, 282 insertions(+), 15 deletions(-) create mode 100644 src/java/org/apache/cassandra/tcm/CMSLookup.java create mode 100644 src/java/org/apache/cassandra/tcm/membership/EndpointLookup.java diff --git a/src/java/org/apache/cassandra/tcm/CMSLookup.java b/src/java/org/apache/cassandra/tcm/CMSLookup.java new file mode 100644 index 000000000000..f7218c17f28e --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/CMSLookup.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Maps; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.tcm.listeners.ChangeListener; +import org.apache.cassandra.tcm.membership.EndpointLookup; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.Pair; + +public class CMSLookup +{ + private static final Logger logger = LoggerFactory.getLogger(CMSLookup.class); + + public enum State { PRE_INIT, ACTIVE, RETIRED }; + + public final static CMSLookup NO_OP = new CMSLookup(State.PRE_INIT, Epoch.EMPTY, new HashMap<>()); + public static InitialBuilder builder(ClusterMetadata metadata) + { + return new InitialBuilder(metadata); + } + + private final Map> overrides; + private final BiMap addressMap; + private final Epoch lastModified; + private final State state; + + private CMSLookup(State state, Epoch epoch, Map> overrides) + { + this.state = state; + this.lastModified = epoch; + this.addressMap = HashBiMap.create(overrides.size()); + this.overrides = Maps.newHashMapWithExpectedSize(overrides.size()); + for (Map.Entry> e : overrides.entrySet()) + { + this.overrides.put(e.getKey(), e.getValue()); + this.addressMap.put(e.getValue().left, e.getValue().right); + } + } + + public boolean isUninitialized() + { + return state == State.PRE_INIT; + } + + public boolean isActive() + { + return state == State.ACTIVE; + } + + public InetAddressAndPort getAddressOverride(NodeId id) + { + Pair override = overrides.get(id); + return override != null ? override.right : null; + } + + public EndpointLookup asNodeLookup(EndpointLookup lookup) + { + return new EndpointLookup() + { + @Override + public InetAddressAndPort endpoint(NodeId id) + { + if (overrides.containsKey(id)) + return overrides.get(id).right; + return lookup.endpoint(id); + } + }; + } + + public CMSLookup rebuild(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + { + logger.debug("Rebuilding CMS lookup {} with metadata from epoch {}", this, next.epoch.getEpoch()); + + // All address changes have been enacted, nothing to do + if (state == State.RETIRED) + return this; + + if (!next.epoch.isEqualOrBefore(Epoch.FIRST) + && !fromSnapshot + && next.directory.lastModified().equals(prev.directory.lastModified())) + return this; + + // Filters from the override list those which are no longer necessary as a transformation has now + // replaced the old address with the new one for that node id + Predicate>> overrideNowEnacted = entry -> { + NodeId nodeId = entry.getKey(); + if (!Objects.equals(prev.directory.getNodeAddresses(nodeId), next.directory.getNodeAddresses(nodeId))) + { + Pair override = overrides.get(nodeId); + if (override != null) + { + // If there was an override for this nodeId && the old/new addresses match the prev/next directory + // entries, filter the override from the map which will be used to build the new version + return !override.left.equals(prev.directory.endpoint(nodeId)) + || !override.right.equals(next.directory.endpoint(nodeId)); + } + } + return true; + }; + + logger.debug("Current endpoint overrides: {}", overrides); + Map> nextOverrides + = overrides.entrySet() + .stream() + .filter(overrideNowEnacted) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + logger.debug("Proposed endpoint overrides: {}", nextOverrides); + State state = nextOverrides.isEmpty() ? State.RETIRED : State.ACTIVE; + return new CMSLookup(state, next.epoch, nextOverrides); + } + + @Override + public String toString() + { + return "CMSLookup{" + + "state=" + state + + ", epoch=" + lastModified + + ", overrides=" + overrides + + ", addressMap=" + addressMap + + '}'; + } + + public static class InitialBuilder + { + private final Epoch epoch; + private final Map> overrides; + + private InitialBuilder(ClusterMetadata metadata) + { + this.epoch = metadata.epoch; + this.overrides = new HashMap<>(); + } + + public InitialBuilder withOverride(NodeId id, InetAddressAndPort originalAddress, InetAddressAndPort newAddress) + { + overrides.put(id, Pair.create(originalAddress, newAddress)); + return this; + } + + public CMSLookup build() + { + return new CMSLookup(State.ACTIVE, epoch, overrides); + } + } + + public static class LogListener implements ChangeListener + { + @Override + public void notifyPreCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + { + logger.debug("Reevaluating CMSLookup from {} at epoch {}", prev.epoch, next.epoch); + next.refreshCMSLookup(prev, fromSnapshot); + } + } +} diff --git a/src/java/org/apache/cassandra/tcm/CMSMembership.java b/src/java/org/apache/cassandra/tcm/CMSMembership.java index 5b88543be8af..b46b4b35446d 100644 --- a/src/java/org/apache/cassandra/tcm/CMSMembership.java +++ b/src/java/org/apache/cassandra/tcm/CMSMembership.java @@ -31,6 +31,7 @@ import org.apache.cassandra.locator.MetaStrategy; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.EndpointLookup; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; @@ -80,18 +81,18 @@ public static CMSMembership reconstruct(DataPlacement placement, Directory direc return new CMSMembership(lm, BTreeSet.of(full), BTreeSet.of(joining)); } - public DataPlacement toPlacement(Directory directory) + public DataPlacement toPlacement(EndpointLookup lookup) { DataPlacement.Builder builder = DataPlacement.builder(); for (NodeId id : fullMembers) { - Replica replica = MetaStrategy.replica(directory.endpoint(id)); + Replica replica = MetaStrategy.replica(lookup.endpoint(id)); builder.withReadReplica(lastModified, replica); builder.withWriteReplica(lastModified, replica); } for(NodeId id : joiningMembers) { - builder.withWriteReplica(lastModified, MetaStrategy.replica(directory.endpoint(id))); + builder.withWriteReplica(lastModified, MetaStrategy.replica(lookup.endpoint(id))); } return builder.build(); } diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 39f102486a4b..d4a9d1bcda7e 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -70,6 +70,7 @@ import org.apache.cassandra.tcm.extensions.ExtensionKey; import org.apache.cassandra.tcm.extensions.ExtensionValue; import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.EndpointLookup; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; @@ -121,7 +122,7 @@ public class ClusterMetadata // It is initialized with any address changes detected in Startup::initializeCMSLookup when the node boots. It is // subsequently updated exclusively by CMSLookup.LogListener when enacting a transformation which modifies the // addresses of CMS member(s), making those changes permanent/durable. - public volatile CMSLookup cmsLookup = CMSLookup.FIRST; + public volatile CMSLookup cmsLookup = CMSLookup.NO_OP; // These fields are lazy but only for the test purposes, since their computation requires initialization of the log ks private EndpointsForRange fullCMSReplicas; @@ -189,7 +190,6 @@ public ClusterMetadata(Epoch epoch, cmsMembership); } - private ClusterMetadata(int metadataIdentifier, Epoch epoch, IPartitioner partitioner, @@ -224,7 +224,8 @@ private ClusterMetadata(int metadataIdentifier, this.locator = Locator.usingDirectory(directory); this.accordStaleReplicas = accordStaleReplicas; this.cmsMembership = cmsMembership; - this.cmsDataPlacement = calculateCMSPlacement(placements, cmsMembership); + // Build CMS placement using no-op CMS lookup, i.e. using only committed node addresses + this.cmsDataPlacement = calculateCMSPlacement(placements, cmsMembership, CMSLookup.NO_OP); } public Set fullCMSMemberIds() @@ -247,9 +248,10 @@ public Set fullCMSMembers() { if (fullCMSEndpoints == null) { + EndpointLookup lookup = endpointLookup(); fullCMSEndpoints = ImmutableSet.copyOf(cmsMembership.fullMembers() .stream() - .map(directory::endpoint) + .map(lookup::endpoint) .collect(Collectors.toSet())); } return fullCMSEndpoints; @@ -259,15 +261,57 @@ public EndpointsForRange fullCMSMembersAsReplicas() { if (fullCMSReplicas == null) { + EndpointLookup lookup = endpointLookup(); EndpointsForRange.Builder builder = EndpointsForRange.builder(MetaStrategy.entireRange); for (NodeId nodeId : fullCMSMemberIds()) - builder.add(MetaStrategy.replica(directory.endpoint(nodeId))); + builder.add(MetaStrategy.replica(lookup.endpoint(nodeId))); fullCMSReplicas = builder.build(); } return fullCMSReplicas; } - private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMembership cms) + // Synchronization is probably not necessary as this should only be called by a log listener + public synchronized void refreshCMSLookup(ClusterMetadata prev, boolean fromSnapshot) + { + CMSLookup prevLookup = prev.cmsLookup; + CMSLookup proposedLookup = prevLookup.rebuild(prev, this, fromSnapshot); + // rebuild returns `this` if no changes + if (prevLookup == proposedLookup) + logger.debug("No changes to CMSLookup between epochs {} and {}", prev.epoch.getEpoch(), epoch.getEpoch()); + else + { + logger.debug("Replacing CMSLookup, Current: {}, Proposed: {}", prevLookup, proposedLookup); + // recalculate CMS placement using new lookup + cmsDataPlacement = calculateCMSPlacement(placements, cmsMembership, proposedLookup); + // We shouldn't need to null out the other CMS fields when we refresh the CMSLookup as that is done in a + // precommit listener, meaning nothing should be accessing the ClusterMetadata instance yet. + } + + // either way, set the lookup for _this_ epoch + cmsLookup = proposedLookup; + } + + // Synchronization is probably not necessary as this should only be called once, during startup + public synchronized boolean initCMSLookup(CMSLookup lookup) + { + logger.info("Initializing CMS lookup on ClusterMetadata at epoch {}", epoch.getEpoch()); + boolean isInitial = cmsLookup.isUninitialized(); + logger.debug("Current CMS lookup: {}, proposed: {}", cmsLookup, lookup); + if (isInitial) + { + cmsLookup = lookup; + cmsDataPlacement = calculateCMSPlacement(placements, cmsMembership, lookup); + // We need to null out these fields when we init the CMSLookup because post-commit listeners may have + // accessed them during log replay, which happens before this. + fullCMSReplicas = null; + fullCMSEndpoints = null; + } + else + logger.warn("Invalid CMSLookup state for initialization: {}", cmsLookup); + return isInitial; + } + + private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMembership cms, CMSLookup lookup) { if (epoch.isBefore(Epoch.FIRST) || schema.getKeyspaces().get(SchemaConstants.METADATA_KEYSPACE_NAME).isEmpty()) return DataPlacement.empty(); @@ -310,11 +354,17 @@ private DataPlacement calculateCMSPlacement(DataPlacements placements, CMSMember } else { - // Build a placement based on the CMS membership - return cms.toPlacement(directory); + // Build a placement based on the CMS membership, with any in-flight endpoint changes applied + EndpointLookup endpointLookup = lookup.isActive() ? lookup.asNodeLookup(directory) : directory; + return cms.toPlacement(endpointLookup); } } + public EndpointLookup endpointLookup() + { + return cmsLookup.isActive() ? cmsLookup.asNodeLookup(directory) : directory; + } + public DataPlacement placement(ReplicationParams params) { return params.isMeta() ? cmsDataPlacement : placements.get(params); diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index 7d70b157095b..28ee2c3541ea 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -194,7 +194,8 @@ public static State state(ClusterMetadata metadata) Commit.Replicator replicator = CassandraRelevantProperties.TCM_USE_TEST_NO_OP_REPLICATOR.getBoolean() ? Commit.Replicator.NO_OP - : new Commit.DefaultReplicator(() -> log.metadata().directory); + : new Commit.DefaultReplicator(() -> log.metadata().directory, + () -> log.metadata().endpointLookup()); RemoteProcessor remoteProcessor = new RemoteProcessor(log, Discovery.instance::discoveredNodes); GossipProcessor gossipProcessor = new GossipProcessor(); diff --git a/src/java/org/apache/cassandra/tcm/Commit.java b/src/java/org/apache/cassandra/tcm/Commit.java index 734868caf5f4..6808fead7c76 100644 --- a/src/java/org/apache/cassandra/tcm/Commit.java +++ b/src/java/org/apache/cassandra/tcm/Commit.java @@ -41,6 +41,7 @@ import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.EndpointLookup; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeVersion; import org.apache.cassandra.tcm.serialization.Version; @@ -414,10 +415,12 @@ public interface Replicator public static class DefaultReplicator implements Replicator { private final Supplier directorySupplier; + private final Supplier lookupSupplier; - public DefaultReplicator(Supplier directorySupplier) + public DefaultReplicator(Supplier directorySupplier, Supplier lookupSupplier) { this.directorySupplier = directorySupplier; + this.lookupSupplier = lookupSupplier; } public void send(Result result, InetAddressAndPort source) @@ -427,6 +430,7 @@ public void send(Result result, InetAddressAndPort source) Result.Success success = result.success(); Directory directory = directorySupplier.get(); + EndpointLookup lookup = lookupSupplier.get(); // Filter the log entries from the commit result for the purposes of replicating to members of the cluster // other than the original submitter. We only need to include the sublist of entries starting at the one @@ -438,7 +442,7 @@ public void send(Result result, InetAddressAndPort source) for (NodeId peerId : directory.peerIds()) { - InetAddressAndPort endpoint = directory.endpoint(peerId); + InetAddressAndPort endpoint = lookup.endpoint(peerId); boolean upgraded = directory.version(peerId).isUpgraded(); // Do not replicate to self and to the peer that has requested to commit this message if (endpoint.equals(FBUtilities.getBroadcastAddressAndPort()) || diff --git a/src/java/org/apache/cassandra/tcm/membership/Directory.java b/src/java/org/apache/cassandra/tcm/membership/Directory.java index c0c4d7a4df9a..5fb3d7edbe90 100644 --- a/src/java/org/apache/cassandra/tcm/membership/Directory.java +++ b/src/java/org/apache/cassandra/tcm/membership/Directory.java @@ -59,7 +59,7 @@ import static org.apache.cassandra.db.TypeSizes.sizeof; import static org.apache.cassandra.tcm.membership.NodeVersion.CURRENT; -public class Directory implements MetadataValue +public class Directory implements MetadataValue, EndpointLookup { public static final Serializer serializer = new Serializer(); diff --git a/src/java/org/apache/cassandra/tcm/membership/EndpointLookup.java b/src/java/org/apache/cassandra/tcm/membership/EndpointLookup.java new file mode 100644 index 000000000000..03840d948ff4 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/membership/EndpointLookup.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.membership; + +import org.apache.cassandra.locator.InetAddressAndPort; + +public interface EndpointLookup +{ + InetAddressAndPort endpoint(NodeId id); +} From 7807fd526ecfd025204b75385e473ea17e7a97ca Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 2 Jun 2025 11:50:04 +0100 Subject: [PATCH 18/23] [CASSANDRA-20476] Perform rediscovery of CMS at startup if addresses change --- .../apache/cassandra/net/MessageDelivery.java | 6 +- src/java/org/apache/cassandra/net/Verb.java | 6 +- .../cassandra/tcm/ClusterMetadataService.java | 2 +- .../org/apache/cassandra/tcm/Discovery.java | 74 +++++++--- .../org/apache/cassandra/tcm/Startup.java | 136 +++++++++++++++++- .../apache/cassandra/tcm/log/LocalLog.java | 23 +++ .../tcm/DiscoverySimulationTest.java | 2 +- 7 files changed, 224 insertions(+), 25 deletions(-) diff --git a/src/java/org/apache/cassandra/net/MessageDelivery.java b/src/java/org/apache/cassandra/net/MessageDelivery.java index 2e9155f71356..064dacb0e050 100644 --- a/src/java/org/apache/cassandra/net/MessageDelivery.java +++ b/src/java/org/apache/cassandra/net/MessageDelivery.java @@ -62,7 +62,7 @@ static Collection> fanoutAndWait(Messag @Override public void onResponse(Message msg) { - logger.info("Received a {} response from {}: {}", msg.verb(), msg.from(), msg.payload); + logger.debug("Received a {} response from {}: {}", msg.verb(), msg.from(), msg.payload); responses.add(Pair.create(msg.from(), msg.payload)); cdl.decrement(); } @@ -70,13 +70,13 @@ public void onResponse(Message msg) @Override public void onFailure(InetAddressAndPort from, RequestFailure reason) { - logger.info("Received failure in response to {} from {}: {}", verb, from, reason); + logger.debug("Received failure in response to {} from {}: {}", verb, from, reason); cdl.decrement(); } }; sendTo.forEach((ep) -> { - logger.info("Election for metadata migration sending {} ({}) to {}", verb, payload.toString(), ep); + logger.debug("Sending {} ({}) to {}", verb, payload.toString(), ep); messaging.sendWithCallback(Message.out(verb, payload), ep, callback); }); cdl.awaitUninterruptibly(timeout, timeUnit); diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index ad2317e5abc0..cbb7691ce8c4 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -311,8 +311,12 @@ public enum Verb TCM_ABORT_MIG (811, P0, rpcTimeout, INTERNAL_METADATA, () -> CMSInitializationRequest.Initiator.serializer,() -> Election.instance.abortHandler, TCM_INIT_MIG_RSP ), TCM_DISCOVER_RSP (812, P0, rpcTimeout, INTERNAL_METADATA, () -> Discovery.serializer, RESPONSE_HANDLER ), TCM_DISCOVER_REQ (813, P0, rpcTimeout, INTERNAL_METADATA, () -> NoPayload.serializer, () -> Discovery.instance.requestHandler, TCM_DISCOVER_RSP ), - TCM_FETCH_PEER_LOG_RSP (818, P0, shortTimeout, FETCH_METADATA, MessageSerializers::logStateSerializer, RESPONSE_HANDLER ), + TCM_FETCH_PEER_LOG_RSP (818, P0, rpcTimeout, FETCH_METADATA, MessageSerializers::logStateSerializer, () -> ResponseVerbHandler.instance ), TCM_FETCH_PEER_LOG_REQ (819, P0, rpcTimeout, FETCH_METADATA, () -> FetchPeerLog.serializer, () -> FetchPeerLog.Handler.instance, TCM_FETCH_PEER_LOG_RSP ), + TCM_DISCOVER_PEERS_RSP (820, P0, rpcTimeout, INTERNAL_METADATA, () -> Discovery.serializer, () -> ResponseVerbHandler.instance ), + TCM_DISCOVER_PEERS_REQ (821, P0, rpcTimeout, INTERNAL_METADATA, () -> NoPayload.serializer, () -> Discovery.instance.requestHandler, TCM_DISCOVER_PEERS_RSP), + TCM_DISCOVER_SURVEY_RSP(822, P0, rpcTimeout, INTERNAL_METADATA, () -> Discovery.nodeIdSerializer, () -> ResponseVerbHandler.instance ), + TCM_DISCOVER_SURVEY_REQ(823, P0, rpcTimeout, INTERNAL_METADATA, () -> NoPayload.serializer, () -> Discovery.instance.requestHandler, TCM_DISCOVER_SURVEY_RSP), INITIATE_DATA_MOVEMENTS_RSP (814, P1, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), INITIATE_DATA_MOVEMENTS_REQ (815, P1, rpcTimeout, MISC, () -> DataMovement.serializer, () -> DataMovementVerbHandler.instance, INITIATE_DATA_MOVEMENTS_RSP ), diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index 28ee2c3541ea..b0945536bd5e 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -153,7 +153,7 @@ public static State state() return state(ClusterMetadata.current()); } - public static State state(ClusterMetadata metadata) + public static ClusterMetadataService.State state(ClusterMetadata metadata) { if (CassandraRelevantProperties.TCM_UNSAFE_BOOT_WITH_CLUSTERMETADATA.isPresent()) return RESET; diff --git a/src/java/org/apache/cassandra/tcm/Discovery.java b/src/java/org/apache/cassandra/tcm/Discovery.java index 99ca8907cc68..21990118d395 100644 --- a/src/java/org/apache/cassandra/tcm/Discovery.java +++ b/src/java/org/apache/cassandra/tcm/Discovery.java @@ -36,6 +36,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; @@ -47,6 +48,7 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.NoPayload; import org.apache.cassandra.net.Verb; +import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; @@ -63,6 +65,28 @@ public class Discovery public static final Discovery instance = new Discovery(); public static final Serializer serializer = new Serializer(); + // TODO add this to MessageSerializers properly or define a real response format + public static final IVersionedSerializer nodeIdSerializer = new IVersionedSerializer<>() + { + @Override + public void serialize(NodeId t, DataOutputPlus out, int version) throws IOException + { + out.writeUnsignedVInt32(t.id()); + } + + @Override + public NodeId deserialize(DataInputPlus in, int version) throws IOException + { + int id = in.readUnsignedVInt32(); + return new NodeId(id); + } + + @Override + public long serializedSize(NodeId t, int version) + { + return TypeSizes.sizeofUnsignedVInt(t.id()); + } + }; public final IVerbHandler requestHandler; private final Set discovered = new ConcurrentSkipListSet<>(); @@ -94,12 +118,14 @@ public Discovery(Supplier messaging, Supplier candidates = new HashSet<>(); if (initiator != null) @@ -148,12 +176,11 @@ public DiscoveredNodes discoverOnce(InetAddressAndPort initiator, long timeout, else candidates.addAll(discovered); - if (candidates.isEmpty()) - candidates.addAll(seeds.get()); - + candidates.addAll(seeds.get()); candidates.remove(self); - Collection> responses = MessageDelivery.fanoutAndWait(messaging.get(), candidates, Verb.TCM_DISCOVER_REQ, NoPayload.noPayload, timeout, timeUnit); + Verb verb = allPeers ? Verb.TCM_DISCOVER_PEERS_REQ : Verb.TCM_DISCOVER_REQ; + Collection> responses = MessageDelivery.fanoutAndWait(messaging.get(), candidates, verb, NoPayload.noPayload, timeout, timeUnit); for (Pair discoveredNodes : responses) { @@ -179,19 +206,30 @@ private final class DiscoveryRequestHandler implements IVerbHandler @Override public void doVerb(Message message) { + discovered.add(message.from()); Set cms = ClusterMetadata.current().fullCMSMembers(); - logger.debug("Responding to discovery request from {}: {}", message.from(), cms); - DiscoveredNodes discoveredNodes; - if (!cms.isEmpty()) - discoveredNodes = new DiscoveredNodes(cms, DiscoveredNodes.Kind.CMS_ONLY); - else + switch (message.verb()) { - discovered.add(message.from()); - discoveredNodes = new DiscoveredNodes(new HashSet<>(discovered), DiscoveredNodes.Kind.KNOWN_PEERS); + case TCM_DISCOVER_REQ: + logger.trace("Responding to discovery request from {}: {}", message.from(), cms); + if (!cms.isEmpty()) + discoveredNodes = new DiscoveredNodes(cms, DiscoveredNodes.Kind.CMS_ONLY); + else + discoveredNodes = new DiscoveredNodes(new HashSet<>(discovered), DiscoveredNodes.Kind.KNOWN_PEERS); + messaging.get().send(message.responseWith(discoveredNodes), message.from()); + break; + case TCM_DISCOVER_PEERS_REQ: + logger.trace("Responding to {} request from {}", message.verb(), message.from()); + discoveredNodes = new DiscoveredNodes(new HashSet<>(discovered), DiscoveredNodes.Kind.KNOWN_PEERS); + messaging.get().send(message.responseWith(discoveredNodes), message.from()); + break; + case TCM_DISCOVER_SURVEY_REQ: + logger.trace("Responding to {} request from {}", message.verb(), message.from()); + NodeId id = NodeId.fromUUID(SystemKeyspace.getLocalHostId()); + messaging.get().send(message.responseWith(id), message.from()); + break; } - - messaging.get().send(message.responseWith(discoveredNodes), message.from()); } } diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index ebcaa2abd1b3..2354a3f27547 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -19,7 +19,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -52,7 +54,10 @@ import org.apache.cassandra.gms.NewGossiper; import org.apache.cassandra.gms.VersionedValue; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.MessageDelivery; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Keyspaces; @@ -117,7 +122,16 @@ public static void initialize(Set seeds, case NORMAL: logger.info("Initializing as non CMS node"); initializeAsNonCmsNode(wrapProcessor); + ClusterMetadataService.instance().log().pause(); initMessaging.run(); + UUID localHostId = SystemKeyspace.getLocalHostId(); + // If null, this node has not yet registered so there will be more nothing to do here + if (localHostId != null) + { + NodeId nodeId = NodeId.fromUUID(localHostId); + initializeCMSLookup(nodeId, ClusterMetadata.current()); + } + ClusterMetadataService.instance().log().unpause(); break; case VOTE: logger.info("Initializing for discovery"); @@ -173,7 +187,6 @@ public static void initializeAsNonCmsNode(Function wrapPro ClusterMetadataService::state, logSpec)); ClusterMetadataService.instance().log().ready(); - NodeId nodeId = ClusterMetadata.current().myNodeId(); UUID currentHostId = SystemKeyspace.getLocalHostId(); if (nodeId != null && !Objects.equals(nodeId.toUUID(), currentHostId)) @@ -192,6 +205,127 @@ public static void initializeAsNonCmsNode(Function wrapPro } } + + /** + * If the broadcast address of this node has changed, we must verify the endpoints it knows for + * the members of the CMS are still reachable and valid. This is necessary for the node to submit + * a STARTUP transformation which updates its broadcast address in ClusterMetadata. + * + * If the node is itself a CMS member, it is also a requirement to be able to contact a + * majority of the other CMS members in order to perform the serial reads and writes which + * constitute committing to and fetching from the distributed metadata log. + * + * To do this, we use a simple protocol: + * 1. For each CMS member in our replayed ClusterMetadata, ping the associated broadcast address + * to query for id of the node at that address. This determines whether the endpoint still + * belongs to that same node (which is/was a CMS member). + * 2. While we don't have confirmed current addresses for a majority of CMS nodes: + * 2a. Run discovery to locate as many peer addresses as possible. + * 2b. Query every discovered endpoint and ask for its node id. + * If we still don't have confirmed addresses for a majority of CMS members, go to 2a and + * repeat as peers may themselves still be starting up and so may have become discoverable. + * + * This process builds up a mapping of id -> current address for CMS members which can then be + * used to construct a set of temporary redirects between addresses according to ClusterMetadata + * and the newly discovered ones. + * + * As each CMS node with a changed address goes through the startup process, it will commit its + * STARTUP transformation and the new broadcast address will be found in ClusterMetadata. A log + * listener is used to react to these transformations by removing redundant address overrides + * as they are enacted. + * + * @param nodeId derived from the persisted id of this node from the system.peers table + * @param replayed current ClusterMetadata after replaying the metadata log for startup + */ + private static ClusterMetadata initializeCMSLookup(NodeId nodeId, ClusterMetadata replayed) + { + InetAddressAndPort oldAddress = replayed.directory.endpoint(nodeId); + InetAddressAndPort newAddress = FBUtilities.getBroadcastAddressAndPort(); + if (newAddress.equals(oldAddress)) + return replayed; + + Map previousCMS = new HashMap<>(); + replayed.fullCMSMemberIds().forEach(id -> previousCMS.put(id, replayed.directory.endpoint(id))); + Map confirmedCMS = new HashMap<>(); + + Set candidates = new HashSet<>(previousCMS.values()); + candidates.add(newAddress); + + int maxRounds = 5; + int currentRound = 0; + long roundTimeNanos = Math.min(TimeUnit.SECONDS.toNanos(4), + DatabaseDescriptor.getDiscoveryTimeout(TimeUnit.NANOSECONDS) / maxRounds); + // TODO a non-CMS node only needs to be able to contact a single CMS member to commit its STARTUP + int quorum = (previousCMS.size() / 2) + 1; + logger.info("Running survey and discovery for CMS nodes {} (quorum = {})", previousCMS, quorum); + while (confirmedCMS.size() < quorum && currentRound < maxRounds) + { + logger.info("In round {} sending survey to {}", currentRound, candidates); + Collection> surveyed = + MessageDelivery.fanoutAndWait(MessagingService.instance(), + candidates, + Verb.TCM_DISCOVER_SURVEY_REQ, + NoPayload.noPayload, + roundTimeNanos, + TimeUnit.NANOSECONDS); + logger.info("Survey of {} discovered {}", candidates, surveyed); + surveyed.forEach(pair -> { + if (previousCMS.containsKey(pair.right)) + confirmedCMS.put(pair.right, pair.left); + }); + + logger.info("Confirmed CMS members {}", confirmedCMS); + if (confirmedCMS.size() < quorum || (previousCMS.size() == 1 && confirmedCMS.containsKey(nodeId))) + { + // In the single node CMS case, run discovery simply to propagate this node's new address to the rest of + // the cluster via the seeds & discovery meshing. Otherwise, if every node has a new address the non-CMS + // members have no way to discover the CMS and it has no way to know the new places to push updates to. + logger.info("Running discovery round; either CMS quorum was not confirmed or this is the only CMS member"); + Discovery.DiscoveredNodes nodes = Discovery.instance.discover(5, true); + candidates.addAll(nodes.nodes()); + logger.info("Rediscovery completed, discovered nodes: {}", nodes); + } + currentRound++; + } + + if (confirmedCMS.size() >= quorum) + { + logger.info("Identified a quorum of CMS members (found {}, required {}).", confirmedCMS.size(), quorum); + if (confirmedCMS.values().containsAll(previousCMS.values())) + { + logger.info("No endpoint changes found for CMS members"); + return replayed; + } + else + { + logger.info("Applying temporary address overrides for uncommitted CMS endpoint changes"); + CMSLookup.InitialBuilder builder = CMSLookup.builder(replayed); + for (NodeId confirmed : confirmedCMS.keySet()) + { + InetAddressAndPort prev = previousCMS.get(confirmed); + InetAddressAndPort next = confirmedCMS.get(confirmed); + if (!next.equals(prev)) + { + logger.info("Added override for {}, ({} -> {})", confirmed, previousCMS.get(confirmed), confirmedCMS.get(confirmed)); + builder = builder.withOverride(confirmed, previousCMS.get(confirmed), confirmedCMS.get(confirmed)); + } + } + if (replayed.initCMSLookup(builder.build())) + { + ClusterMetadataService.instance().log().addListener(new CMSLookup.LogListener()); + return replayed; + } + + throw new RuntimeException("Could not initialize CMS lookup"); + } + } + else + { + throw new RuntimeException(String.format("Unable to identify a quorum of CMS members (found %s, required %s)", + confirmedCMS.size(), quorum)); + } + } + public static void scrubDataDirectories(ClusterMetadata metadata) throws StartupException { // clean up debris in the rest of the keyspaces diff --git a/src/java/org/apache/cassandra/tcm/log/LocalLog.java b/src/java/org/apache/cassandra/tcm/log/LocalLog.java index 0da8e6715b6d..f3ffe7e6c2f9 100644 --- a/src/java/org/apache/cassandra/tcm/log/LocalLog.java +++ b/src/java/org/apache/cassandra/tcm/log/LocalLog.java @@ -103,6 +103,8 @@ public abstract class LocalLog implements Closeable // notification to them all. private final AtomicBoolean replayComplete = new AtomicBoolean(); + private final AtomicBoolean paused = new AtomicBoolean(); + public static LogSpec logSpec() { return new LogSpec(); @@ -382,6 +384,21 @@ public void append(Collection entries) } } + /** + * Temporarily halts processing of the pending buffer. While paused, entries may be appended to the buffer but + * they will not be enacted. Currently only used sparingly during startup if any address changes for CMS members are + * detected. In this case, we pause just while a mapping from old to new addresses is being built. + */ + public void pause() + { + paused.set(true); + } + + public void unpause() + { + paused.set(false); + } + public void append(Entry entry) { maybeAppend(entry); @@ -473,6 +490,12 @@ private Entry peek() */ void processPendingInternal() { + if (paused.get()) + { + logger.trace("Entry processing is paused, returning without scanning pending buffer"); + return; + } + while (true) { Entry pendingEntry = peek(); diff --git a/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java b/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java index 60213554420f..729017bae9f0 100644 --- a/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java +++ b/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java @@ -101,7 +101,7 @@ public void discoveryTest() throws Throwable Map> futures = new HashMap<>(); nodes.forEach((addr, discovery) -> { - futures.put(addr, CompletableFuture.supplyAsync(() -> discovery.discover(5), executor)); + futures.put(addr, CompletableFuture.supplyAsync(() -> discovery.discover(5, false), executor)); }); Map discovered = new HashMap<>(); From ba2256c4e776d16d67b0f518b3c4a9315f2a927d Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Wed, 25 Jun 2025 18:40:59 +0100 Subject: [PATCH 19/23] [CASSANDRA-20476] Attempt to wait for all address changes to be enacted before allowing startup to proceed --- .../apache/cassandra/gms/FailureDetector.java | 5 +++ .../cassandra/net/ResponseVerbHandler.java | 2 + .../org/apache/cassandra/tcm/Startup.java | 39 +++++++++++++++++-- .../test/log/DiscoverNewCMSTest.java | 16 ++++++-- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/cassandra/gms/FailureDetector.java b/src/java/org/apache/cassandra/gms/FailureDetector.java index 13d6266ca039..34f53508093a 100644 --- a/src/java/org/apache/cassandra/gms/FailureDetector.java +++ b/src/java/org/apache/cassandra/gms/FailureDetector.java @@ -328,6 +328,11 @@ public boolean isAlive(InetAddressAndPort ep) // registration via the metadata log, or a full gossip round). This is perfectly harmless, so no need to log // an error in that case. ClusterMetadata metadata = ClusterMetadata.current(); + if (metadata.cmsLookup.isActive() && metadata.fullCMSMembers().contains(ep)) + { + logger.trace("Found endpoint {} in active CMS lookup, assuming it is alive", ep); + return true; + } if (!metadata.directory.allJoinedEndpoints().contains(ep) && !metadata.fullCMSMembers().contains(ep)) logger.error("Unknown endpoint: " + ep, new UnknownEndpointException(ep)); } diff --git a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java index 2550ca2929b8..cb846c4331f3 100644 --- a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java +++ b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java @@ -49,6 +49,8 @@ class ResponseVerbHandler implements IVerbHandler Verb.TCM_REPLICATION, Verb.TCM_NOTIFY_RSP, Verb.TCM_DISCOVER_RSP, + Verb.TCM_DISCOVER_PEERS_RSP, + Verb.TCM_DISCOVER_SURVEY_RSP, Verb.TCM_INIT_MIG_RSP); // We skip epoch catchup for PaxosV2 verbs, since we are using PaxosV2 to serially read the log. diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index 2354a3f27547..d79103e4fd34 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -121,17 +121,47 @@ public static void initialize(Set seeds, break; case NORMAL: logger.info("Initializing as non CMS node"); + // note: if this node was a member of the CMS, log replay will put it back into that state initializeAsNonCmsNode(wrapProcessor); - ClusterMetadataService.instance().log().pause(); - initMessaging.run(); UUID localHostId = SystemKeyspace.getLocalHostId(); // If null, this node has not yet registered so there will be more nothing to do here if (localHostId != null) { NodeId nodeId = NodeId.fromUUID(localHostId); - initializeCMSLookup(nodeId, ClusterMetadata.current()); + ClusterMetadata replayed = ClusterMetadata.current(); + InetAddressAndPort oldAddress = replayed.directory.endpoint(nodeId); + InetAddressAndPort newAddress = FBUtilities.getBroadcastAddressAndPort(); + if (!newAddress.equals(oldAddress)) + { + // Build temporary mappings for addressing CMS nodes who's addresses have been + // changed but not yet committed via the CMS or not yet been enacted locally. + ClusterMetadataService.instance().log().pause(); + initMessaging.run(); + initializeCMSLookup(nodeId, replayed); + ClusterMetadataService.instance().log().unpause(); + + logger.info("Detected change in local node addresses, committing update to Cluster Metadata Service"); + Transformation transform = new org.apache.cassandra.tcm.transformations.Startup(nodeId, + NodeAddresses.current(), + NodeVersion.CURRENT); + replayed = ClusterMetadataService.instance().commit(transform); + + // Wait for any CMS members which have uncommitted address changes to commit them and + // us to enact them. + while (replayed.cmsLookup.isActive()) + { + logger.info("Waiting for pending CMS address changes to complete {}", replayed.cmsLookup); + TimeUnit.MILLISECONDS.sleep(1000); // TODO make configurable? + replayed = ClusterMetadata.current(); + } + } + logger.info("CMS address changes complete, current epoch is {}", replayed.epoch.getEpoch()); + } + else + { + // nothing to do, so just initialise messaging + initMessaging.run(); } - ClusterMetadataService.instance().log().unpause(); break; case VOTE: logger.info("Initializing for discovery"); @@ -272,6 +302,7 @@ private static ClusterMetadata initializeCMSLookup(NodeId nodeId, ClusterMetadat surveyed.forEach(pair -> { if (previousCMS.containsKey(pair.right)) confirmedCMS.put(pair.right, pair.left); + }); logger.info("Confirmed CMS members {}", confirmedCMS); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java index 431e042bff00..41e94a22d8b9 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/DiscoverNewCMSTest.java @@ -103,12 +103,20 @@ public void multiNodeCMSAllAddressesChangeTest() throws IOException, ExecutionEx private void test(Cluster cluster, int cmsSize) throws IOException, ExecutionException, InterruptedException { ExecutorService executor = Executors.newFixedThreadPool(cluster.size()); + // Some fetchCMSLog operations might temporarily fail and be retried during address changes + String[] expectedErrors = new String[] { + "Cannot achieve consistency level SERIAL.*", + "Queried for epoch Epoch\\{epoch=\\d+\\}, but could not catch up.*" + }; cluster.setUncaughtExceptionsFilter((node, t) -> { Throwable rootCause = Throwables.getRootCause(t); - // Some fetchCMSLog operations might temporarily fail and be retried during address changes - return rootCause.getMessage() != null - && rootCause.getMessage().startsWith("Cannot achieve consistency level SERIAL"); - + if (rootCause.getMessage() == null) + return false; + String message = rootCause.getMessage(); + for (String expected : expectedErrors) + if (message.matches(expected)) + return true; + return false; }); cluster.startup(); init(cluster); From 76e9f1dd9f49a231ad167381ba2d511917f2485d Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Tue, 24 Jun 2025 13:09:55 +0100 Subject: [PATCH 20/23] [CASSANDRA-20476] Some minor logging additions --- src/java/org/apache/cassandra/tcm/Commit.java | 2 +- .../org/apache/cassandra/tcm/transformations/Startup.java | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/tcm/Commit.java b/src/java/org/apache/cassandra/tcm/Commit.java index 6808fead7c76..42e6b613d26a 100644 --- a/src/java/org/apache/cassandra/tcm/Commit.java +++ b/src/java/org/apache/cassandra/tcm/Commit.java @@ -367,8 +367,8 @@ static class Handler implements IVerbHandler public void doVerb(Message message) throws IOException { - checkCMSState(); logger.info("Received commit request {} from {}", message.payload, message.from()); + checkCMSState(); Retry retryPolicy = Retry.until(message.expiresAtNanos(), TCMMetrics.instance.commitRetries); Result result = processor.commit(message.payload.entryId, message.payload.transform, message.payload.lastKnown, retryPolicy); if (result.isSuccess()) diff --git a/src/java/org/apache/cassandra/tcm/transformations/Startup.java b/src/java/org/apache/cassandra/tcm/transformations/Startup.java index 4476be31901e..d0721f1053a7 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/Startup.java +++ b/src/java/org/apache/cassandra/tcm/transformations/Startup.java @@ -22,6 +22,9 @@ import java.util.Map; import java.util.Objects; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.tcm.ClusterMetadata; @@ -40,6 +43,7 @@ public class Startup implements Transformation { + private static final Logger logger = LoggerFactory.getLogger(Startup.class); public static final Serializer serializer = new Serializer(); private final NodeId nodeId; private final NodeVersion nodeVersion; @@ -108,6 +112,7 @@ public static void maybeExecuteStartupTransformation(NodeId localNodeId) if (!Objects.equals(directory.addresses.get(localNodeId), NodeAddresses.current()) || !Objects.equals(directory.versions.get(localNodeId), NodeVersion.CURRENT)) { + logger.info("Detected change in node addresses or version, committing updates to Cluster Metadata Service"); ClusterMetadataService.instance() .commit(new Startup(localNodeId, NodeAddresses.current(), NodeVersion.CURRENT)); } From 214261f45b226d298125314788ad7733cd472fd3 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Tue, 5 Aug 2025 19:24:29 +0100 Subject: [PATCH 21/23] [CASSANDRA-20476] Make sure to start messaging service --- src/java/org/apache/cassandra/tcm/Startup.java | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index d79103e4fd34..63f6593e8d52 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -125,7 +125,11 @@ public static void initialize(Set seeds, initializeAsNonCmsNode(wrapProcessor); UUID localHostId = SystemKeyspace.getLocalHostId(); // If null, this node has not yet registered so there will be more nothing to do here - if (localHostId != null) + if (localHostId == null) + { + initMessaging.run(); + } + else { NodeId nodeId = NodeId.fromUUID(localHostId); ClusterMetadata replayed = ClusterMetadata.current(); @@ -154,13 +158,13 @@ public static void initialize(Set seeds, TimeUnit.MILLISECONDS.sleep(1000); // TODO make configurable? replayed = ClusterMetadata.current(); } + logger.info("Any in flight CMS address changes have been processed, current epoch is {}", replayed.epoch.getEpoch()); + } + else + { + // nothing more to do, so just initialize messaging + initMessaging.run(); } - logger.info("CMS address changes complete, current epoch is {}", replayed.epoch.getEpoch()); - } - else - { - // nothing to do, so just initialise messaging - initMessaging.run(); } break; case VOTE: From a52733aa6f06bbfb3f9bf9aedc4ca712bb0b7683 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Thu, 22 Jan 2026 19:15:39 +0000 Subject: [PATCH 22/23] [CASSANDRA-20476] Don't attempt to catch up from peers or CMS while log processing is suspended --- .../cassandra/tcm/ClusterMetadataService.java | 26 ++++++++++++++++++- .../apache/cassandra/tcm/PeerLogFetcher.java | 1 + .../apache/cassandra/tcm/log/LocalLog.java | 7 ++++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index b0945536bd5e..fda70e9e08bb 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -723,10 +723,15 @@ public ClusterMetadata fetchLogFromCMS(Epoch awaitAtLeast) return metadata; Epoch ourEpoch = metadata.epoch; - if (ourEpoch.isEqualOrAfter(awaitAtLeast)) return metadata; + if (log.isPaused()) + { + logger.debug("Fetch metadata log from CMS was requested, but log processing is paused"); + return metadata; + } + Retry deadline = Retry.untilElapsed(getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.fetchLogRetries); // responses for ALL withhout knowing we have pending metadata = processor.fetchLogAndWait(awaitAtLeast, deadline); @@ -761,6 +766,12 @@ public Future fetchLogFromPeerAsync(InetAddressAndPort from, Ep awaitAtLeast.isBefore(Epoch.FIRST)) return ImmediateFuture.success(current); + if (log.isPaused()) + { + logger.debug("Fetch metadata log from peer was requested, but log processing is paused"); + return ImmediateFuture.success(current); + } + return peerLogFetcher.asyncFetchLog(from, awaitAtLeast); } @@ -789,6 +800,13 @@ private ClusterMetadata fetchLogFromPeer(ClusterMetadata metadata, InetAddressAn Epoch before = metadata.epoch; if (before.isEqualOrAfter(awaitAtLeast)) return metadata; + + if (log.isPaused()) + { + logger.debug("Fetch metadata log from peer was requested, but log processing is paused"); + return metadata; + } + return peerLogFetcher.fetchLogEntriesAndWait(from, awaitAtLeast); } @@ -844,6 +862,12 @@ public ClusterMetadata fetchLogFromPeerOrCMS(ClusterMetadata metadata, InetAddre if (awaitAtLeast.isBefore(Epoch.FIRST) || FBUtilities.getBroadcastAddressAndPort().equals(from)) return metadata; + if (log.isPaused()) + { + logger.debug("Fetch metadata log from peer or CMS was requested, but log processing is paused"); + return metadata; + } + Epoch before = metadata.epoch; if (before.isEqualOrAfter(awaitAtLeast)) return metadata; diff --git a/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java b/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java index ceac4c7025c1..45f17a76a08c 100644 --- a/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java +++ b/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java @@ -69,6 +69,7 @@ public ClusterMetadata fetchLogEntriesAndWait(InetAddressAndPort remote, Epoch a catch (ExecutionException | TimeoutException e) { logger.warn("Could not fetch log entries from peer, remote = {}, await = {}", remote, awaitAtleast); + logger.debug("Exception while fetching log entries from peer, remote = {}", remote, e); } return metadata; } diff --git a/src/java/org/apache/cassandra/tcm/log/LocalLog.java b/src/java/org/apache/cassandra/tcm/log/LocalLog.java index f3ffe7e6c2f9..f10563fca825 100644 --- a/src/java/org/apache/cassandra/tcm/log/LocalLog.java +++ b/src/java/org/apache/cassandra/tcm/log/LocalLog.java @@ -399,6 +399,11 @@ public void unpause() paused.set(false); } + public boolean isPaused() + { + return paused.get(); + } + public void append(Entry entry) { maybeAppend(entry); @@ -492,7 +497,7 @@ void processPendingInternal() { if (paused.get()) { - logger.trace("Entry processing is paused, returning without scanning pending buffer"); + logger.info("Metadata log entry processing is paused, returning without scanning pending buffer"); return; } From dc2fab7cf99b59b890a5c97bf89f07997c654715 Mon Sep 17 00:00:00 2001 From: Sam Tunnicliffe Date: Mon, 26 Jan 2026 13:16:37 +0000 Subject: [PATCH 23/23] Make shadow gossip round parameters configurable for testing --- .../cassandra/config/CassandraRelevantProperties.java | 2 ++ src/java/org/apache/cassandra/gms/NewGossiper.java | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index cf514b7dfce1..b883b3c19df1 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -584,6 +584,8 @@ public enum CassandraRelevantProperties // transactional cluster metadata relevant properties // TODO: not a fan of being forced to prefix these to satisfy the alphabetic ordering constraint // but it makes sense to group logically related properties together + TCM_SHADOW_ROUND_MAX_ATTEMPTS("cassandra.shadow_round_max_attempts", "3"), + TCM_SHADOW_ROUND_TIMEOUT("cassandra.shadow_round_timeout_millis", "15000"), /** * for testing purposes disable the automatic CMS reconfiguration after a bootstrap/replace/move operation */ diff --git a/src/java/org/apache/cassandra/gms/NewGossiper.java b/src/java/org/apache/cassandra/gms/NewGossiper.java index 4bb8b7596865..073943fddc33 100644 --- a/src/java/org/apache/cassandra/gms/NewGossiper.java +++ b/src/java/org/apache/cassandra/gms/NewGossiper.java @@ -32,6 +32,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.locator.InetAddressAndPort; @@ -68,15 +69,17 @@ public Map doShadowRound() handler = shadowRoundHandler; int tries = 0; + int maxTries = CassandraRelevantProperties.TCM_SHADOW_ROUND_MAX_ATTEMPTS.getInt(); + long timeout = CassandraRelevantProperties.TCM_SHADOW_ROUND_TIMEOUT.getLong(); while (true) { try { - return shadowRoundHandler.doShadowRound().get(15, TimeUnit.SECONDS); + return shadowRoundHandler.doShadowRound().get(timeout, TimeUnit.MILLISECONDS); } catch (InterruptedException | ExecutionException | TimeoutException e) { - if (++tries > 3) + if (++tries >= maxTries) break; logger.warn("Got no response for shadow round"); }