Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog/unreleased/SOLR-17857-trackSolrMetricsContext.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
title: SolrMetricsContext is now reliably closed (once), and thus some components possessing one are too. Fixes double-close of OTEL, yielding warnings.
type: fixed
authors:
- name: David Smiley
- name: Houston Putman
links:
- name: SOLR-17857
url: https://issues.apache.org/jira/browse/SOLR-17857
150 changes: 70 additions & 80 deletions solr/core/src/java/org/apache/solr/blockcache/Metrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import io.opentelemetry.api.common.Attributes;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.solr.search.SolrCacheBase;
Expand Down Expand Up @@ -51,8 +50,6 @@ public class Metrics extends SolrCacheBase implements SolrInfoBean {
private SolrMetricsContext solrMetricsContext;
private long previous = System.nanoTime();

private AutoCloseable toClose;

@Override
public void initializeMetrics(SolrMetricsContext parentContext, Attributes attributes) {
solrMetricsContext = parentContext.getChildContext(this);
Expand All @@ -70,89 +67,82 @@ public void initializeMetrics(SolrMetricsContext parentContext, Attributes attri
solrMetricsContext.doubleGaugeMeasurement(
"solr_buffer_cache_stats", "Buffer cache per second stats");

this.toClose =
solrMetricsContext.batchCallback(
() -> {
long now = System.nanoTime();
long delta = Math.max(now - previous, 1);
double seconds = delta / 1000000000.0;

long hits_total = blockCacheHit.get();
long hits_delta = hits_total - blockCacheHit_last.get();
blockCacheHit_last.set(hits_total);

long miss_total = blockCacheMiss.get();
long miss_delta = miss_total - blockCacheMiss_last.get();
blockCacheMiss_last.set(miss_total);

long evict_total = blockCacheEviction.get();
long evict_delta = evict_total - blockCacheEviction_last.get();
blockCacheEviction_last.set(evict_total);

long storeFail_total = blockCacheStoreFail.get();
long storeFail_delta = storeFail_total - blockCacheStoreFail_last.get();
blockCacheStoreFail_last.set(storeFail_total);

long lookups_delta = hits_delta + miss_delta;
long lookups_total = hits_total + miss_total;

blockcacheStats.record(
blockCacheSize.get(), baseAttributes.toBuilder().put(TYPE_ATTR, "size").build());
blockcacheStats.record(
lookups_total, baseAttributes.toBuilder().put(TYPE_ATTR, "lookups").build());
blockcacheStats.record(
hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, "hits").build());
blockcacheStats.record(
hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, "evictions").build());
blockcacheStats.record(
storeFail_total,
baseAttributes.toBuilder().put(TYPE_ATTR, "store_fails").build());
perSecStats.record(
getPerSecond(lookups_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "lookups")
.build()); // lookups per second since the last call
perSecStats.record(
getPerSecond(hits_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "hits")
.build()); // hits per second since the last call
perSecStats.record(
getPerSecond(evict_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "evictions")
.build()); // evictions per second since the last call
perSecStats.record(
getPerSecond(storeFail_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "store_fails")
.build()); // evictions per second since the last call
hitRatio.record(
calcHitRatio(lookups_delta, hits_delta),
baseAttributes); // hit ratio since the last call
bufferCacheStats.record(
getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds),
baseAttributes.toBuilder().put(TYPE_ATTR, "allocations").build());
bufferCacheStats.record(
getPerSecond(shardBuffercacheLost.getAndSet(0), seconds),
baseAttributes.toBuilder().put(TYPE_ATTR, "lost").build());
previous = now;
},
blockcacheStats,
perSecStats,
hitRatio,
bufferCacheStats);
solrMetricsContext.batchCallback(
() -> {
long now = System.nanoTime();
long delta = Math.max(now - previous, 1);
double seconds = delta / 1000000000.0;

long hits_total = blockCacheHit.get();
long hits_delta = hits_total - blockCacheHit_last.get();
blockCacheHit_last.set(hits_total);

long miss_total = blockCacheMiss.get();
long miss_delta = miss_total - blockCacheMiss_last.get();
blockCacheMiss_last.set(miss_total);

long evict_total = blockCacheEviction.get();
long evict_delta = evict_total - blockCacheEviction_last.get();
blockCacheEviction_last.set(evict_total);

long storeFail_total = blockCacheStoreFail.get();
long storeFail_delta = storeFail_total - blockCacheStoreFail_last.get();
blockCacheStoreFail_last.set(storeFail_total);

long lookups_delta = hits_delta + miss_delta;
long lookups_total = hits_total + miss_total;

blockcacheStats.record(
blockCacheSize.get(), baseAttributes.toBuilder().put(TYPE_ATTR, "size").build());
blockcacheStats.record(
lookups_total, baseAttributes.toBuilder().put(TYPE_ATTR, "lookups").build());
blockcacheStats.record(
hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, "hits").build());
blockcacheStats.record(
hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, "evictions").build());
blockcacheStats.record(
storeFail_total, baseAttributes.toBuilder().put(TYPE_ATTR, "store_fails").build());
perSecStats.record(
getPerSecond(lookups_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "lookups")
.build()); // lookups per second since the last call
perSecStats.record(
getPerSecond(hits_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "hits")
.build()); // hits per second since the last call
perSecStats.record(
getPerSecond(evict_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "evictions")
.build()); // evictions per second since the last call
perSecStats.record(
getPerSecond(storeFail_delta, seconds),
baseAttributes.toBuilder()
.put(TYPE_ATTR, "store_fails")
.build()); // evictions per second since the last call
hitRatio.record(
calcHitRatio(lookups_delta, hits_delta),
baseAttributes); // hit ratio since the last call
bufferCacheStats.record(
getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds),
baseAttributes.toBuilder().put(TYPE_ATTR, "allocations").build());
bufferCacheStats.record(
getPerSecond(shardBuffercacheLost.getAndSet(0), seconds),
baseAttributes.toBuilder().put(TYPE_ATTR, "lost").build());
previous = now;
},
blockcacheStats,
perSecStats,
hitRatio,
bufferCacheStats);
}

private float getPerSecond(long value, double seconds) {
return (float) (value / seconds);
}

@Override
public void close() {
IOUtils.closeQuietly(toClose);
}

// SolrInfoBean methods

@Override
Expand Down
23 changes: 11 additions & 12 deletions solr/core/src/java/org/apache/solr/cloud/Overseer.java
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,6 @@ private class ClusterStateUpdater implements SolrInfoBean, Runnable, Closeable {

private boolean isClosed = false;

private AutoCloseable toClose;

public ClusterStateUpdater(
final ZkStateReader reader,
final String myId,
Expand All @@ -201,19 +199,19 @@ public ClusterStateUpdater(
this.compressor = compressor;

this.clusterStateUpdaterMetricContext = solrMetricsContext.getChildContext(this);
initializeMetrics(solrMetricsContext, Attributes.of(CATEGORY_ATTR, getCategory().toString()));
initializeMetrics(
clusterStateUpdaterMetricContext, Attributes.of(CATEGORY_ATTR, getCategory().toString()));
}

@Override
public void initializeMetrics(SolrMetricsContext parentContext, Attributes attributes) {
this.toClose =
parentContext.observableLongGauge(
"solr_overseer_state_update_queue_size",
"Size of overseer's update queue",
(observableLongMeasurement) -> {
observableLongMeasurement.record(
stateUpdateQueue.getZkStats().getQueueLength(), attributes);
});
parentContext.observableLongGauge(
"solr_overseer_state_update_queue_size",
"Size of overseer's update queue",
(observableLongMeasurement) -> {
observableLongMeasurement.record(
stateUpdateQueue.getZkStats().getQueueLength(), attributes);
});
}

@Override
Expand Down Expand Up @@ -641,7 +639,7 @@ private LeaderStatus amILeader() {
@Override
public void close() {
this.isClosed = true;
IOUtils.closeQuietly(toClose);
IOUtils.closeQuietly(clusterStateUpdaterMetricContext);
}

@Override
Expand Down Expand Up @@ -836,6 +834,7 @@ public synchronized void close() {
}
this.closed = true;
doClose();
IOUtils.closeQuietly(solrMetricsContext);

assert ObjectReleaseTracker.release(this);
}
Expand Down
21 changes: 10 additions & 11 deletions solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ public class OverseerTaskProcessor implements SolrInfoBean, Runnable, Closeable

private final Stats stats;
private final SolrMetricsContext overseerTaskProcessorMetricsContext;
private AutoCloseable toClose;

/**
* This map may contain tasks which are read from work queue but could not be executed because
Expand Down Expand Up @@ -152,7 +151,9 @@ public OverseerTaskProcessor(
thisNode = MDCLoggingContext.getNodeName();

this.overseerTaskProcessorMetricsContext = solrMetricsContext.getChildContext(this);
initializeMetrics(solrMetricsContext, Attributes.of(CATEGORY_ATTR, getCategory().toString()));
initializeMetrics(
overseerTaskProcessorMetricsContext,
Attributes.of(CATEGORY_ATTR, getCategory().toString()));
}

@Override
Expand Down Expand Up @@ -407,13 +408,12 @@ private void cleanUpRunningTasks() {

@Override
public void initializeMetrics(SolrMetricsContext parentContext, Attributes attributes) {
this.toClose =
parentContext.observableLongGauge(
"solr_overseer_collection_work_queue_size",
"Size of overseer's collection work queue",
(observableLongMeasurement) -> {
observableLongMeasurement.record(workQueue.getZkStats().getQueueLength(), attributes);
});
parentContext.observableLongGauge(
"solr_overseer_collection_work_queue_size",
"Size of overseer's collection work queue",
(observableLongMeasurement) -> {
observableLongMeasurement.record(workQueue.getZkStats().getQueueLength(), attributes);
});
}

@Override
Expand All @@ -424,14 +424,13 @@ public SolrMetricsContext getSolrMetricsContext() {
@Override
public void close() {
isClosed = true;
overseerTaskProcessorMetricsContext.unregister();
if (tpe != null) {
if (!ExecutorUtil.isShutdown(tpe)) {
ExecutorUtil.shutdownAndAwaitTermination(tpe);
}
}
IOUtils.closeQuietly(selector);
IOUtils.closeQuietly(toClose);
IOUtils.closeQuietly(overseerTaskProcessorMetricsContext);
}

public static List<String> getSortedOverseerNodeNames(SolrZkClient zk)
Expand Down
42 changes: 30 additions & 12 deletions solr/core/src/java/org/apache/solr/core/CoreContainer.java
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,10 @@ private void loadInternal() {
}
logging = LogWatcher.newRegisteredLogWatcher(cfg.getLogWatcherConfig(), loader);

solrMetricsContext = new SolrMetricsContext(metricManager, NODE_REGISTRY);

initGpuMetricsService(); // Initialize GPU metrics service

ClusterPluginsSource pluginsSource =
ClusterPluginsSource.loadClusterPluginsSource(this, loader);
containerPluginsRegistry =
Expand All @@ -780,10 +784,12 @@ private void loadInternal() {
containerPluginsRegistry.registerListener(
clusterEventProducerFactory.getPluginRegistryListener());

solrMetricsContext = new SolrMetricsContext(metricManager, NODE_REGISTRY);

// Initialize GPU metrics service
initGpuMetricsService();
// PublicKeyHandler was added to containerHandlers in the constructor before metrics were ready
containerHandlers
.get(PublicKeyHandler.PATH)
.initializeMetrics(
solrMetricsContext,
Attributes.builder().put(HANDLER_ATTR, PublicKeyHandler.PATH).build());

shardHandlerFactory =
ShardHandlerFactory.newInstance(cfg.getShardHandlerFactoryPluginInfo(), loader);
Expand Down Expand Up @@ -1248,6 +1254,10 @@ public void shutdown() {
zkSys.zkController.tryCancelAllElections();
}

// Shut down the coreZkRegister executor early so that any in-progress async reloads
// (triggered by ZK config watchers) complete before we close cores.
ExecutorUtil.shutdownAndAwaitTermination(zkSys.getCoreZkRegisterExecutorService());

ExecutorUtil.shutdownAndAwaitTermination(coreLoadExecutor); // actually already shutdown

// Now clear all the cores that are being operated upon.
Expand Down Expand Up @@ -1279,14 +1289,6 @@ public void shutdown() {

customThreadPool.execute(replayUpdatesExecutor::shutdownAndAwaitTermination);

// Shutdown GPU metrics service if it was initialized
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shutdown/close should do its work in reverse order of construction. Since metrics is initialized super early, I moved the compensating shutdown logic to the end.

shutdownGpuMetricsService();

if (metricManager != null) {
// Close all OTEL meter providers and metrics
metricManager.closeAllRegistries();
}

if (isZooKeeperAware()) {
cancelCoreRecoveries();
}
Expand Down Expand Up @@ -1339,6 +1341,14 @@ public void shutdown() {
}

// It should be safe to close the authentication plugin at this point.
try {
if (pkiAuthenticationSecurityBuilder != null) {
pkiAuthenticationSecurityBuilder.close();
}
} catch (Exception e) {
log.warn("Exception while closing PKI authentication plugin.", e);
}

try {
if (authenticationPlugin != null) {
authenticationPlugin.plugin.close();
Expand All @@ -1362,6 +1372,14 @@ public void shutdown() {
org.apache.lucene.util.IOUtils.closeWhileHandlingException(packageLoader);
}
org.apache.lucene.util.IOUtils.closeWhileHandlingException(loader); // best effort

containerHandlers.close();
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line is new


shutdownGpuMetricsService(); // Shutdown GPU metrics service if it was initialized

IOUtils.closeQuietly(solrMetricsContext);
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line is new


metricManager.closeAllRegistries(); // Close all OTEL meter providers and metrics
}

public void cancelCoreRecoveries() {
Expand Down
Loading
Loading