From 3c0574693cfbea882c64db0d6d4786ac019c5225 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Tue, 24 Mar 2026 16:12:18 -0400 Subject: [PATCH 1/6] Get collection state from cache --- .../handler/component/CloudReplicaSource.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java index 836d0951f062..bf8cb48d797b 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java +++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java @@ -109,12 +109,14 @@ private void withShardsParam(Builder builder, String shardsParam) { if (sliceOrUrl.indexOf('/') < 0) { // this is a logical shard this.slices[i] = sliceOrUrl; + DocCollection coll = clusterState.getCollectionOrNull(builder.collection, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to resolve replicas: " + builder.collection); + } replicas[i] = - findReplicas( - builder, - shardsParam, - clusterState, - clusterState.getCollection(builder.collection).getSlice(sliceOrUrl)); + findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); } else { // this has urls this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true); @@ -189,7 +191,11 @@ private void addSlices( String collectionName, String shardKeys, boolean multiCollection) { - DocCollection coll = state.getCollection(collectionName); + DocCollection coll = state.getCollectionOrNull(collectionName, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to add slices: " + collectionName); + } Collection slices = coll.getRouter().getSearchSlices(shardKeys, params, coll); ClientUtils.addSlices(target, collectionName, slices, multiCollection); } From 
b2c817d584f5521ea83c14d4d48808b738ec02f0 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Wed, 25 Mar 2026 11:10:50 -0400 Subject: [PATCH 2/6] Add changelog --- .../unreleased/SOLR-18176-shardhandler-bottleneck.yml | 7 +++++++ .../apache/solr/handler/component/CloudReplicaSource.java | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml new file mode 100644 index 000000000000..52ba15ba0ce3 --- /dev/null +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -0,0 +1,7 @@ +title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached +type: fixed +authors: + - name: Matthew Biscocho +links: + - name: SOLR-18176 + url: https://issues.apache.org/jira/browse/SOLR-18176 diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java index bf8cb48d797b..5315d413f421 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java +++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java @@ -115,8 +115,7 @@ private void withShardsParam(Builder builder, String shardsParam) { SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to resolve replicas: " + builder.collection); } - replicas[i] = - findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); + replicas[i] = findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); } else { // this has urls this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true); @@ -194,7 +193,8 @@ private void addSlices( DocCollection coll = state.getCollectionOrNull(collectionName, true); if (coll == null) { throw new SolrException( - 
SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to add slices: " + collectionName); + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to add slices: " + collectionName); } Collection slices = coll.getRouter().getSearchSlices(shardKeys, params, coll); ClientUtils.addSlices(target, collectionName, slices, multiCollection); From 14483dd117264560d8e95acc2ce666fe0d7b49b1 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Mon, 30 Mar 2026 16:41:22 -0400 Subject: [PATCH 3/6] Add test --- .../SOLR-18176-shardhandler-bottleneck.yml | 2 +- ...ributedQueryComponentOptimizationTest.java | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml index 52ba15ba0ce3..167b1b59e24b 100644 --- a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -1,5 +1,5 @@ title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached -type: fixed +type: changed authors: - name: Matthew Biscocho links: diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 349d6dda7114..186589d192f1 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -24,15 +24,19 @@ import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.solr.BaseDistributedSearchTestCase; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.jetty.HttpJettySolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import 
org.apache.solr.client.solrj.request.SolrQuery; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.cloud.SolrZKMetricsListener; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ShardParams; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; +import org.apache.solr.embedded.JettySolrRunner; import org.junit.BeforeClass; import org.junit.Test; @@ -707,6 +711,55 @@ private QueryResponse queryWithAsserts(String... q) throws Exception { return response; } + /** + * When a node resolves collection state for a collection it doesn't host, queries should use + * cached state and not make ZK calls on every query. + */ + @Test + public void testDistributedQueryDoesNotReadFromZk() throws Exception { + final String testCollection = "testCollection"; + + // Create a collection on only 1 node so the other node uses LazyCollectionRef for state + List jettys = cluster.getJettySolrRunners(); + CollectionAdminRequest.createCollection(testCollection, "conf", 1, 1) + .setCreateNodeSet(jettys.get(0).getNodeName()) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + cluster + .getZkStateReader() + .waitForState( + testCollection, + DEFAULT_TIMEOUT, + TimeUnit.SECONDS, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + try { + // Node 1 hosts COLLECTION but not testCollection. 
+ // Send a multi-collection query to trigger LazyCollectionRef get call + JettySolrRunner nodeWithoutOther = jettys.get(1); + try (SolrClient client = + new HttpJettySolrClient.Builder(nodeWithoutOther.getBaseUrl().toString()).build()) { + + String collectionsParameter = COLLECTION + "," + testCollection; + + // Warm up LazyCollectionRef state cache with query + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + + SolrZKMetricsListener metrics = cluster.getZkStateReader().getZkClient().getMetrics(); + long existsBefore = metrics.getExistsChecks(); + + // Query again and assert that exists call is not made + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + assertEquals( + "Query should not cause ZK exists checks as collection state should be cached", + existsBefore, + metrics.getExistsChecks()); + } + } finally { + CollectionAdminRequest.deleteCollection(testCollection) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + } + } + private int getNumRequests( Map> requests) { int beforeNumRequests = 0; From 15c7534f3ee7ac0a5687bc04433beb71c532dd92 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Wed, 1 Apr 2026 15:57:36 -0400 Subject: [PATCH 4/6] Address PR comments --- .../SOLR-18176-shardhandler-bottleneck.yml | 4 +++- ...ributedQueryComponentOptimizationTest.java | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml index 167b1b59e24b..1ce13b101d4a 100644 --- a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -1,7 +1,9 @@ -title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached +title: Fix query throughput bottleneck caused by uncached ZooKeeper get calls for queries with explicit 
'collection' parameter type: changed authors: - name: Matthew Biscocho links: - name: SOLR-18176 url: https://issues.apache.org/jira/browse/SOLR-18176 + - name: SOLR-15352 + url: https://issues.apache.org/jira/browse/SOLR-15352 diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 186589d192f1..65fca131af8f 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -717,34 +717,40 @@ private QueryResponse queryWithAsserts(String... q) throws Exception { */ @Test public void testDistributedQueryDoesNotReadFromZk() throws Exception { - final String testCollection = "testCollection"; + final String secondColl = "secondColl"; // Create a collection on only 1 node so the other node uses LazyCollectionRef for state List jettys = cluster.getJettySolrRunners(); - CollectionAdminRequest.createCollection(testCollection, "conf", 1, 1) + CollectionAdminRequest.createCollection(secondColl, "conf", 1, 1) .setCreateNodeSet(jettys.get(0).getNodeName()) .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); cluster .getZkStateReader() .waitForState( - testCollection, + secondColl, DEFAULT_TIMEOUT, TimeUnit.SECONDS, (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); try { - // Node 1 hosts COLLECTION but not testCollection. + // Node 1 hosts COLLECTION but not secondColl. 
// Send a multi-collection query to trigger LazyCollectionRef get call - JettySolrRunner nodeWithoutOther = jettys.get(1); + JettySolrRunner nodeWithoutSecondColl = jettys.get(1); try (SolrClient client = - new HttpJettySolrClient.Builder(nodeWithoutOther.getBaseUrl().toString()).build()) { + new HttpJettySolrClient.Builder(nodeWithoutSecondColl.getBaseUrl().toString()).build()) { - String collectionsParameter = COLLECTION + "," + testCollection; + String collectionsParameter = COLLECTION + "," + secondColl; // Warm up LazyCollectionRef state cache with query client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); - SolrZKMetricsListener metrics = cluster.getZkStateReader().getZkClient().getMetrics(); + // Get ZK metrics from the coordinator node (the one we're querying) + SolrZKMetricsListener metrics = + nodeWithoutSecondColl + .getCoreContainer() + .getZkController() + .getZkClient() + .getMetrics(); long existsBefore = metrics.getExistsChecks(); // Query again and assert that exists call is not made @@ -755,7 +761,7 @@ public void testDistributedQueryDoesNotReadFromZk() throws Exception { metrics.getExistsChecks()); } } finally { - CollectionAdminRequest.deleteCollection(testCollection) + CollectionAdminRequest.deleteCollection(secondColl) .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); } } From bcd9ad8d5c124794e28e8eb3770d5416ccd701ee Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Thu, 2 Apr 2026 10:39:21 -0400 Subject: [PATCH 5/6] tidy --- .../DistributedQueryComponentOptimizationTest.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 65fca131af8f..9387d8680b75 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ 
b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -746,11 +746,7 @@ public void testDistributedQueryDoesNotReadFromZk() throws Exception { // Get ZK metrics from the coordinator node (the one we're querying) SolrZKMetricsListener metrics = - nodeWithoutSecondColl - .getCoreContainer() - .getZkController() - .getZkClient() - .getMetrics(); + nodeWithoutSecondColl.getCoreContainer().getZkController().getZkClient().getMetrics(); long existsBefore = metrics.getExistsChecks(); // Query again and assert that exists call is not made From 620f0e2d77e5db66c64d93cd0ed18a240ce01d9f Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Mon, 6 Apr 2026 10:43:58 -0400 Subject: [PATCH 6/6] Changelog change --- changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml index 1ce13b101d4a..7ce9084e0cac 100644 --- a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -1,4 +1,4 @@ -title: Fix query throughput bottleneck caused by uncached ZooKeeper get calls for queries with explicit 'collection' parameter +title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached. The extra call happened during distributed search over multiple collections when the coordinator had no local replica for some of them. type: changed authors: - name: Matthew Biscocho