From 1c4067be400760120e6fcaaa9bfebfd4234e1d34 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Fri, 20 Mar 2026 21:18:01 +0900 Subject: [PATCH 1/6] Add local cache to regexResolver Signed-off-by: SungJin1212 --- CHANGELOG.md | 2 +- docs/configuration/config-file-reference.md | 6 + integration/querier_tenant_federation_test.go | 15 +- .../tenantfederation/regex_resolver.go | 63 ++++- .../tenantfederation/regex_resolver_test.go | 258 ++++++++++++++++++ .../tenantfederation/tenant_federation.go | 3 + schemas/cortex-config-schema.json | 6 + 7 files changed, 341 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index efc085818af..b2f2ea4b795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog ## master / unreleased - * [ENHANCEMENT] Cache: Add per-tenant TTL configuration for query results cache to control cache expiration on a per-tenant basis with separate TTLs for regular and out-of-order data. #7357 +* [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363 * [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355 ## 1.21.0 in progress diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index b2bd78549c0..7c1cd7265df 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -310,6 +310,12 @@ tenant_federation: # CLI flag: -tenant-federation.user-sync-interval [user_sync_interval: | default = 5m] + # [Experimental] Cache size of regex match results used by regex resolver. + # Each cache entry includes both regex patterns (e.g. `user-.+`) and resolved + # tenant IDs (e.g. `user-1`). Set to 0 or less to disable caching. + # CLI flag: -tenant-federation.regex-cache-size + [regex_cache_size: | default = 1000] + # [Experimental] If enabled, query errors from individual tenants are treated # as warnings, allowing partial results to be returned. # CLI flag: -tenant-federation.allow-partial-data diff --git a/integration/querier_tenant_federation_test.go b/integration/querier_tenant_federation_test.go index 4bd5d4265f8..0ee5024be8e 100644 --- a/integration/querier_tenant_federation_test.go +++ b/integration/querier_tenant_federation_test.go @@ -145,7 +145,7 @@ func runQuerierTenantFederationTest_UseRegexResolver(t *testing.T, cfg querierTe "-frontend.memcached.addresses": "dns+" + memcached.NetworkEndpoint(e2ecache.MemcachedPort), "-tenant-federation.enabled": "true", "-tenant-federation.regex-matcher-enabled": "true", - "-tenant-federation.user-sync-interval": "1s", + "-tenant-federation.user-sync-interval": "5s", // to upload block quickly "-blocks-storage.tsdb.block-ranges-period": blockRangePeriod.String(), @@ -255,6 +255,19 @@ func runQuerierTenantFederationTest_UseRegexResolver(t *testing.T, cfg querierTe assert.Equal(t, mergeResults(tenantIDs, expectedVectors), result.(model.Vector)) + // Verify regex resolver cache entries. + querierSum, err := querier.SumMetrics([]string{"cortex_regex_resolver_matched_cache_size"}, e2e.SkipMissingMetrics) + require.NoError(t, err) + + totalCacheSize := querierSum[0] + if cfg.shuffleShardingEnabled { + querier2Sum, err := querier2.SumMetrics([]string{"cortex_regex_resolver_matched_cache_size"}, e2e.SkipMissingMetrics) + require.NoError(t, err) + totalCacheSize += querier2Sum[0] + } + + require.Equal(t, float64(numUsers+1), totalCacheSize) + // ensure a push to multiple tenants is failing series, _ := generateSeries("series_1", now) res, err := c.Push(series) diff --git a/pkg/querier/tenantfederation/regex_resolver.go b/pkg/querier/tenantfederation/regex_resolver.go index cab49303f5b..69770e6a921 100644 --- a/pkg/querier/tenantfederation/regex_resolver.go +++ b/pkg/querier/tenantfederation/regex_resolver.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + lru "github.com/hashicorp/golang-lru/v2" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -27,6 +28,8 @@ var ( errInvalidRegex = errors.New("invalid regex present") ErrTooManyTenants = "too many tenants, max: %d, actual: %d" + + defaultRegexCacheSize = 1000 ) // RegexResolver resolves tenantIDs matched given regex. @@ -38,12 +41,17 @@ type RegexResolver struct { maxTenant int userScanner users.Scanner logger log.Logger - sync.Mutex + sync.RWMutex + + // matchedCache stores the results of regex matching + matchedCache *lru.Cache[string, []string] // lastUpdateUserRun stores the timestamps of the latest update user loop run lastUpdateUserRun prometheus.Gauge // discoveredUsers stores the number of discovered user discoveredUsers prometheus.Gauge + // matchedCacheSize stores the size of the matchedCache + matchedCacheSize prometheus.Gauge } func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, reg prometheus.Registerer, bucketClientFactory func(ctx context.Context) (objstore.InstrumentedBucket, error), logger log.Logger) (*RegexResolver, error) { @@ -57,11 +65,20 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, return nil, errors.Wrap(err, "failed to create users scanner") } + var matchedCache *lru.Cache[string, []string] + if tenantFederationCfg.RegexCacheSize > 0 { + matchedCache, err = lru.New[string, []string](tenantFederationCfg.RegexCacheSize) + if err != nil { + return nil, errors.Wrap(err, "failed to create regex cache") + } + } + r := &RegexResolver{ userSyncInterval: tenantFederationCfg.UserSyncInterval, maxTenant: tenantFederationCfg.MaxTenant, userScanner: userScanner, logger: logger, + matchedCache: matchedCache, } r.lastUpdateUserRun = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ @@ -72,6 +89,10 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, Name: "cortex_regex_resolver_discovered_users", Help: "Number of discovered users.", }) + r.matchedCacheSize = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "cortex_regex_resolver_matched_cache_size", + Help: "Number of entries stored in the matched cache.", + }) r.Service = services.NewBasicService(nil, r.running, nil) @@ -99,6 +120,12 @@ func (r *RegexResolver) running(ctx context.Context) error { r.knownUsers = append(active, deleting...) // We keep it sort sort.Strings(r.knownUsers) + + // Reset the cache because the set of available users has changed. + if r.matchedCache != nil { + r.matchedCache.Purge() + r.matchedCacheSize.Set(0) + } r.Unlock() r.lastUpdateUserRun.SetToCurrentTime() r.discoveredUsers.Set(float64(len(active) + len(deleting))) @@ -126,16 +153,15 @@ func (r *RegexResolver) TenantIDs(ctx context.Context) ([]string, error) { return nil, err } - orgIDs, err := r.getRegexMatchedOrgIds(orgID) - if err != nil { - return nil, err - } - - return users.ValidateOrgIDs(orgIDs) + return r.getRegexMatchedOrgIds(orgID) } func (r *RegexResolver) getRegexMatchedOrgIds(orgID string) ([]string, error) { - var matched []string + if r.matchedCache != nil { + if cachedMatched, ok := r.matchedCache.Get(orgID); ok { + return r.validateAndReturnMatched(orgID, cachedMatched) + } + } // Use the Prometheus FastRegexMatcher m, err := labels.NewFastRegexMatcher(orgID) @@ -143,14 +169,30 @@ func (r *RegexResolver) getRegexMatchedOrgIds(orgID string) ([]string, error) { return nil, errInvalidRegex } - r.Lock() - defer r.Unlock() + var matched []string + + r.RLock() for _, id := range r.knownUsers { if m.MatchString(id) { matched = append(matched, id) } } + r.RUnlock() + + validatedMatched, err := users.ValidateOrgIDs(matched) + if err != nil { + return nil, err + } + + if r.matchedCache != nil { + r.matchedCache.Add(orgID, validatedMatched) + r.matchedCacheSize.Set(float64(r.matchedCache.Len())) + } + + return r.validateAndReturnMatched(orgID, validatedMatched) +} +func (r *RegexResolver) validateAndReturnMatched(orgID string, matched []string) ([]string, error) { if len(matched) == 0 { if err := users.ValidTenantID(orgID); err == nil { // when querying for a newly created orgID, the query may not @@ -165,6 +207,7 @@ func (r *RegexResolver) getRegexMatchedOrgIds(orgID string) ([]string, error) { return []string{"fake"}, nil } + // Enforce the maximum number of tenants allowed in a federated query. if r.maxTenant > 0 && len(matched) > r.maxTenant { return nil, httpgrpc.Errorf(http.StatusBadRequest, "%s", fmt.Errorf(ErrTooManyTenants, r.maxTenant, len(matched)).Error()) } diff --git a/pkg/querier/tenantfederation/regex_resolver_test.go b/pkg/querier/tenantfederation/regex_resolver_test.go index 8e9d274979b..b58ff96c050 100644 --- a/pkg/querier/tenantfederation/regex_resolver_test.go +++ b/pkg/querier/tenantfederation/regex_resolver_test.go @@ -3,6 +3,7 @@ package tenantfederation import ( "context" "errors" + "fmt" "strings" "testing" "time" @@ -167,3 +168,260 @@ func Test_RegexValidator(t *testing.T) { }) } } + +func Test_RegexResolver_Cache(t *testing.T) { + tests := []struct { + description string + existingTenants []string + firstOrgID string + secondOrgID string + expectedFirst []string + expectedSecond []string + }{ + { + description: "cache hit with same regex", + existingTenants: []string{"user-1", "user-2", "user-3"}, + firstOrgID: "user-.+", + secondOrgID: "user-.+", + expectedFirst: []string{"user-1", "user-2", "user-3"}, + expectedSecond: []string{"user-1", "user-2", "user-3"}, + }, + { + description: "cache miss with different regex", + existingTenants: []string{"user-1", "user-2", "admin-1"}, + firstOrgID: "user-.+", + secondOrgID: "admin-.+", + expectedFirst: []string{"user-1", "user-2"}, + expectedSecond: []string{"admin-1"}, + }, + } + + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + reg := prometheus.NewRegistry() + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", tc.existingTenants, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + for _, existingTenant := range tc.existingTenants { + bucketClient.MockExists(users.GetGlobalDeletionMarkPath(existingTenant), false, nil) + bucketClient.MockExists(users.GetLocalDeletionMarkPath(existingTenant), false, nil) + } + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + tenantFederationConfig := Config{UserSyncInterval: time.Second, MaxTenant: 0, RegexCacheSize: 10} + regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), regexResolver)) + + // wait update knownUsers + test.Poll(t, time.Second*10, true, func() any { + return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 && testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(tc.existingTenants)) + }) + + // First query - cache miss + ctx1 := user.InjectOrgID(context.Background(), tc.firstOrgID) + orgIDs1, err := regexResolver.TenantIDs(ctx1) + require.NoError(t, err) + require.Equal(t, tc.expectedFirst, orgIDs1) + + // Verify cache was populated + regexResolver.RLock() + cached, exists := regexResolver.matchedCache.Get(tc.firstOrgID) + regexResolver.RUnlock() + require.True(t, exists, "cache should contain the first query result") + require.Equal(t, tc.expectedFirst, cached) + + // Second query - should use cache + ctx2 := user.InjectOrgID(context.Background(), tc.secondOrgID) + orgIDs2, err := regexResolver.TenantIDs(ctx2) + require.NoError(t, err) + require.Equal(t, tc.expectedSecond, orgIDs2) + + // Verify cache behavior + if tc.firstOrgID == tc.secondOrgID { + // Same query should hit cache + regexResolver.RLock() + cached2, exists2 := regexResolver.matchedCache.Get(tc.secondOrgID) + regexResolver.RUnlock() + require.True(t, exists2) + require.Equal(t, tc.expectedSecond, cached2) + } + }) + } +} + +func Test_RegexResolver_CacheInvalidation(t *testing.T) { + reg := prometheus.NewRegistry() + initialTenants := []string{"user-1", "user-2"} + bucketClient := &bucket.ClientMock{} + + // Initial mock setup + bucketClient.MockIter("", initialTenants, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + for _, tenant := range initialTenants { + bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil) + bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil) + } + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + tenantFederationConfig := Config{UserSyncInterval: time.Hour, MaxTenant: 0, RegexCacheSize: 10} + regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(t, err) + + // Avoid background ticker races by setting known users directly for this unit test. + regexResolver.Lock() + regexResolver.knownUsers = append([]string(nil), initialTenants...) + regexResolver.discoveredUsers.Set(float64(len(initialTenants))) + regexResolver.Unlock() + + // First query - populate cache + ctx := user.InjectOrgID(context.Background(), "user-.+") + orgIDs, err := regexResolver.TenantIDs(ctx) + require.NoError(t, err) + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) + + // Verify cache is populated + regexResolver.RLock() + require.Equal(t, 1, regexResolver.matchedCache.Len()) + require.Equal(t, float64(1), testutil.ToFloat64(regexResolver.matchedCacheSize)) + regexResolver.RUnlock() + + // Simulate user-sync invalidation by clearing cache under resolver lock. + regexResolver.Lock() + regexResolver.matchedCache.Purge() + regexResolver.matchedCacheSize.Set(0) + regexResolver.Unlock() + + regexResolver.RLock() + require.Equal(t, 0, regexResolver.matchedCache.Len()) + require.Equal(t, float64(0), testutil.ToFloat64(regexResolver.matchedCacheSize)) + regexResolver.RUnlock() + + // Query again - cache should be repopulated + orgIDs, err = regexResolver.TenantIDs(ctx) + require.NoError(t, err) + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) +} + +func Test_RegexResolver_UsesConfiguredCacheSize(t *testing.T) { + reg := prometheus.NewRegistry() + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + tenantFederationConfig := Config{ + UserSyncInterval: time.Hour, + MaxTenant: 0, + RegexCacheSize: 2, + } + + regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(t, err) + + regexResolver.Lock() + regexResolver.knownUsers = []string{"user-1", "user-2", "user-3"} + regexResolver.Unlock() + + for _, orgID := range []string{"user-1", "user-2", "user-3"} { + ctx := user.InjectOrgID(context.Background(), orgID) + _, err := regexResolver.TenantIDs(ctx) + require.NoError(t, err) + } + + regexResolver.RLock() + require.Equal(t, 2, regexResolver.matchedCache.Len()) + require.Equal(t, float64(2), testutil.ToFloat64(regexResolver.matchedCacheSize)) + regexResolver.RUnlock() +} + +func BenchmarkRegexResolver_TenantIDs(b *testing.B) { + numUsers := 1000 + existingTenants := make([]string, numUsers) + for i := range numUsers { + existingTenants[i] = fmt.Sprintf("user-%d", i) + } + + reg := prometheus.NewRegistry() + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", existingTenants, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + for _, tenant := range existingTenants { + bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil) + bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil) + } + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + tenantFederationConfig := Config{UserSyncInterval: time.Second, MaxTenant: 0, RegexCacheSize: 2000} + regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(b, err) + err = services.StartAndAwaitRunning(context.Background(), regexResolver) + require.NoError(b, err) + + // Wait for initial sync + test.Poll(b, time.Second*10, true, func() any { + return testutil.ToFloat64(regexResolver.discoveredUsers) == float64(numUsers) + }) + + b.Run("ColdCache with regex tenant", func(b *testing.B) { + for b.Loop() { + // Cache clean + regexResolver.Lock() + regexResolver.matchedCache.Purge() + regexResolver.matchedCacheSize.Set(0) + regexResolver.Unlock() + + ctx := user.InjectOrgID(context.Background(), "user-.+") + _, _ = regexResolver.TenantIDs(ctx) + } + }) + + b.Run("WarmCache with regex tenant", func(b *testing.B) { + ctx := user.InjectOrgID(context.Background(), "user-.+") + _, _ = regexResolver.TenantIDs(ctx) // To warm up cache + + b.ResetTimer() + for b.Loop() { + _, _ = regexResolver.TenantIDs(ctx) + } + }) + + b.Run("ColdCache with resolved tenant", func(b *testing.B) { + for b.Loop() { + // Cache clean + regexResolver.Lock() + regexResolver.matchedCache.Purge() + regexResolver.matchedCacheSize.Set(0) + regexResolver.Unlock() + + ctx := user.InjectOrgID(context.Background(), "user-1") + _, _ = regexResolver.TenantIDs(ctx) + } + }) + + b.Run("WarmCache with resolved tenant", func(b *testing.B) { + ctx := user.InjectOrgID(context.Background(), "user-1") + _, _ = regexResolver.TenantIDs(ctx) // To warm up cache + + b.ResetTimer() + for b.Loop() { + _, _ = regexResolver.TenantIDs(ctx) + } + }) +} diff --git a/pkg/querier/tenantfederation/tenant_federation.go b/pkg/querier/tenantfederation/tenant_federation.go index 6aca128ab57..38fe0d60f98 100644 --- a/pkg/querier/tenantfederation/tenant_federation.go +++ b/pkg/querier/tenantfederation/tenant_federation.go @@ -16,6 +16,8 @@ type Config struct { RegexMatcherEnabled bool `yaml:"regex_matcher_enabled"` // UserSyncInterval How frequently to scan users, scanned users are used to calculate matched tenantIDs if the regex matcher is enabled. UserSyncInterval time.Duration `yaml:"user_sync_interval"` + // RegexCacheSize Maximum number of regex match results cached locally. + RegexCacheSize int `yaml:"regex_cache_size"` // AllowPartialData If true, enables returning partial results. AllowPartialData bool `yaml:"allow_partial_data"` } @@ -26,5 +28,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.IntVar(&cfg.MaxTenant, "tenant-federation.max-tenant", 0, "A maximum number of tenants to query at once. 0 means no limit.") f.BoolVar(&cfg.RegexMatcherEnabled, "tenant-federation.regex-matcher-enabled", false, "[Experimental] If enabled, the `X-Scope-OrgID` header value can accept a regex and the matched tenantIDs are automatically involved. The regex matching rule follows the Prometheus, see the detail: https://prometheus.io/docs/prometheus/latest/querying/basics/#regular-expressions. The user discovery is based on scanning block storage, so new users can get queries after uploading a block (generally 2h).") f.DurationVar(&cfg.UserSyncInterval, "tenant-federation.user-sync-interval", time.Minute*5, "[Experimental] If the regex matcher is enabled, it specifies how frequently to scan users. The scanned users are used to calculate matched tenantIDs. The scanning strategy depends on the `-blocks-storage.users-scanner.strategy`.") + f.IntVar(&cfg.RegexCacheSize, "tenant-federation.regex-cache-size", defaultRegexCacheSize, "[Experimental] Cache size of regex match results used by regex resolver. Each cache entry includes both regex patterns (e.g. `user-.+`) and resolved tenant IDs (e.g. `user-1`). Set to 0 or less to disable caching.") f.BoolVar(&cfg.AllowPartialData, "tenant-federation.allow-partial-data", false, "[Experimental] If enabled, query errors from individual tenants are treated as warnings, allowing partial results to be returned.") } diff --git a/schemas/cortex-config-schema.json b/schemas/cortex-config-schema.json index 37e5afe09e7..20cf970c35b 100644 --- a/schemas/cortex-config-schema.json +++ b/schemas/cortex-config-schema.json @@ -9111,6 +9111,12 @@ "type": "number", "x-cli-flag": "tenant-federation.max-tenant" }, + "regex_cache_size": { + "default": 1000, + "description": "[Experimental] Cache size of regex match results used by regex resolver. Each cache entry includes both regex patterns (e.g. `user-.+`) and resolved tenant IDs (e.g. `user-1`). Set to 0 or less to disable caching.", + "type": "number", + "x-cli-flag": "tenant-federation.regex-cache-size" + }, "regex_matcher_enabled": { "default": false, "description": "[Experimental] If enabled, the `X-Scope-OrgID` header value can accept a regex and the matched tenantIDs are automatically involved. The regex matching rule follows the Prometheus, see the detail: https://prometheus.io/docs/prometheus/latest/querying/basics/#regular-expressions. The user discovery is based on scanning block storage, so new users can get queries after uploading a block (generally 2h).", From c5c6e1271862cafdbfcfd32d265c2ba3cf19776f Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Fri, 20 Mar 2026 21:30:47 +0900 Subject: [PATCH 2/6] Add cache hit/miss metrics Signed-off-by: SungJin1212 --- .../tenantfederation/regex_resolver.go | 32 +++++++-- .../tenantfederation/regex_resolver_test.go | 65 +++++++++++++++++++ 2 files changed, 92 insertions(+), 5 deletions(-) diff --git a/pkg/querier/tenantfederation/regex_resolver.go b/pkg/querier/tenantfederation/regex_resolver.go index 69770e6a921..403d5703dee 100644 --- a/pkg/querier/tenantfederation/regex_resolver.go +++ b/pkg/querier/tenantfederation/regex_resolver.go @@ -52,6 +52,10 @@ type RegexResolver struct { discoveredUsers prometheus.Gauge // matchedCacheSize stores the size of the matchedCache matchedCacheSize prometheus.Gauge + + matchedCacheHits prometheus.Counter + matchedCacheMisses prometheus.Counter + } func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, reg prometheus.Registerer, bucketClientFactory func(ctx context.Context) (objstore.InstrumentedBucket, error), logger log.Logger) (*RegexResolver, error) { @@ -78,7 +82,27 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, maxTenant: tenantFederationCfg.MaxTenant, userScanner: userScanner, logger: logger, - matchedCache: matchedCache, + } + + if tenantFederationCfg.RegexCacheSize > 0 { + matchedCache, err = lru.New[string, []string](tenantFederationCfg.RegexCacheSize) + if err != nil { + return nil, errors.Wrap(err, "failed to create regex cache") + } + r.matchedCache = matchedCache + + r.matchedCacheSize = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "cortex_regex_resolver_matched_cache_size", + Help: "Number of entries stored in the matched cache.", + }) + r.matchedCacheHits = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "cortex_regex_resolver_matched_cache_hits_total", + Help: "Total number of successful cache lookups for the regex matched.", + }) + r.matchedCacheMisses = promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "cortex_regex_resolver_matched_cache_misses_total", + Help: "Total number of cache misses for the regex matched.", + }) } r.lastUpdateUserRun = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ @@ -89,10 +113,6 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, Name: "cortex_regex_resolver_discovered_users", Help: "Number of discovered users.", }) - r.matchedCacheSize = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ - Name: "cortex_regex_resolver_matched_cache_size", - Help: "Number of entries stored in the matched cache.", - }) r.Service = services.NewBasicService(nil, r.running, nil) @@ -159,8 +179,10 @@ func (r *RegexResolver) TenantIDs(ctx context.Context) ([]string, error) { func (r *RegexResolver) getRegexMatchedOrgIds(orgID string) ([]string, error) { if r.matchedCache != nil { if cachedMatched, ok := r.matchedCache.Get(orgID); ok { + r.matchedCacheHits.Inc() return r.validateAndReturnMatched(orgID, cachedMatched) } + r.matchedCacheMisses.Inc() } // Use the Prometheus FastRegexMatcher diff --git a/pkg/querier/tenantfederation/regex_resolver_test.go b/pkg/querier/tenantfederation/regex_resolver_test.go index b58ff96c050..9a48e5645ed 100644 --- a/pkg/querier/tenantfederation/regex_resolver_test.go +++ b/pkg/querier/tenantfederation/regex_resolver_test.go @@ -347,6 +347,71 @@ func Test_RegexResolver_UsesConfiguredCacheSize(t *testing.T) { regexResolver.RUnlock() } +func Test_RegexResolver_CacheHitAndMissMetrics(t *testing.T) { + reg := prometheus.NewRegistry() + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + tenantFederationConfig := Config{ + UserSyncInterval: time.Hour, + MaxTenant: 0, + RegexCacheSize: 10, + } + + regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(t, err) + + regexResolver.Lock() + regexResolver.knownUsers = []string{"user-1", "user-2"} + regexResolver.Unlock() + + checkMetrics := func(expectedHits, expectedMisses float64) { + t.Helper() + require.Equal(t, expectedHits, testutil.ToFloat64(regexResolver.matchedCacheHits)) + require.Equal(t, expectedMisses, testutil.ToFloat64(regexResolver.matchedCacheMisses)) + } + + ctx1 := user.InjectOrgID(context.Background(), "user-.+") + _, err = regexResolver.TenantIDs(ctx1) + require.NoError(t, err) + checkMetrics(0, 1) // 0 Hits, 1 Miss + + // Same lookup -> Hit + _, err = regexResolver.TenantIDs(ctx1) + require.NoError(t, err) + checkMetrics(1, 1) // 1 Hit, 1 Miss + + // Same lookup again -> Hit + _, err = regexResolver.TenantIDs(ctx1) + require.NoError(t, err) + checkMetrics(2, 1) // 2 Hits, 1 Miss + + // Different lookup -> Miss + ctx2 := user.InjectOrgID(context.Background(), "user-1") + _, err = regexResolver.TenantIDs(ctx2) + require.NoError(t, err) + checkMetrics(2, 2) // 2 Hits, 2 Misses + + // Purge cache (simulate user sync) + regexResolver.Lock() + regexResolver.matchedCache.Purge() + regexResolver.Unlock() + + // Look up previously cached items after purge -> Misses + _, err = regexResolver.TenantIDs(ctx1) + require.NoError(t, err) + _, err = regexResolver.TenantIDs(ctx2) + require.NoError(t, err) + + checkMetrics(2, 4) // 2 Hits, 4 Misses +} + func BenchmarkRegexResolver_TenantIDs(b *testing.B) { numUsers := 1000 existingTenants := make([]string, numUsers) From f401869ea8dcff3e4d6220927bfc131dcbc68b24 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Fri, 20 Mar 2026 21:37:25 +0900 Subject: [PATCH 3/6] Refactor test Signed-off-by: SungJin1212 --- .../tenantfederation/regex_resolver_test.go | 79 ++++--------------- 1 file changed, 14 insertions(+), 65 deletions(-) diff --git a/pkg/querier/tenantfederation/regex_resolver_test.go b/pkg/querier/tenantfederation/regex_resolver_test.go index 9a48e5645ed..84970f47eaa 100644 --- a/pkg/querier/tenantfederation/regex_resolver_test.go +++ b/pkg/querier/tenantfederation/regex_resolver_test.go @@ -222,11 +222,18 @@ func Test_RegexResolver_Cache(t *testing.T) { return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 && testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(tc.existingTenants)) }) + checkMetrics := func(expectedHits, expectedMisses float64) { + t.Helper() + require.Equal(t, expectedHits, testutil.ToFloat64(regexResolver.matchedCacheHits)) + require.Equal(t, expectedMisses, testutil.ToFloat64(regexResolver.matchedCacheMisses)) + } + // First query - cache miss ctx1 := user.InjectOrgID(context.Background(), tc.firstOrgID) orgIDs1, err := regexResolver.TenantIDs(ctx1) require.NoError(t, err) require.Equal(t, tc.expectedFirst, orgIDs1) + checkMetrics(0, 1) // Verify cache was populated regexResolver.RLock() @@ -249,6 +256,10 @@ func Test_RegexResolver_Cache(t *testing.T) { regexResolver.RUnlock() require.True(t, exists2) require.Equal(t, tc.expectedSecond, cached2) + checkMetrics(1, 1) + } else { + // Different query should miss cache + checkMetrics(0, 2) } }) } @@ -309,6 +320,9 @@ func Test_RegexResolver_CacheInvalidation(t *testing.T) { orgIDs, err = regexResolver.TenantIDs(ctx) require.NoError(t, err) require.Equal(t, []string{"user-1", "user-2"}, orgIDs) + require.Equal(t, 1, regexResolver.matchedCache.Len()) + require.Equal(t, float64(1), testutil.ToFloat64(regexResolver.matchedCacheSize)) + } func Test_RegexResolver_UsesConfiguredCacheSize(t *testing.T) { @@ -347,71 +361,6 @@ func Test_RegexResolver_UsesConfiguredCacheSize(t *testing.T) { regexResolver.RUnlock() } -func Test_RegexResolver_CacheHitAndMissMetrics(t *testing.T) { - reg := prometheus.NewRegistry() - bucketClient := &bucket.ClientMock{} - bucketClient.MockIter("", []string{"user-1", "user-2"}, nil) - bucketClient.MockIter("__markers__", []string{}, nil) - - bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { - return bucketClient, nil - } - - usersScannerConfig := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} - tenantFederationConfig := Config{ - UserSyncInterval: time.Hour, - MaxTenant: 0, - RegexCacheSize: 10, - } - - regexResolver, err := NewRegexResolver(usersScannerConfig, tenantFederationConfig, reg, bucketClientFactory, log.NewNopLogger()) - require.NoError(t, err) - - regexResolver.Lock() - regexResolver.knownUsers = []string{"user-1", "user-2"} - regexResolver.Unlock() - - checkMetrics := func(expectedHits, expectedMisses float64) { - t.Helper() - require.Equal(t, expectedHits, testutil.ToFloat64(regexResolver.matchedCacheHits)) - require.Equal(t, expectedMisses, testutil.ToFloat64(regexResolver.matchedCacheMisses)) - } - - ctx1 := user.InjectOrgID(context.Background(), "user-.+") - _, err = regexResolver.TenantIDs(ctx1) - require.NoError(t, err) - checkMetrics(0, 1) // 0 Hits, 1 Miss - - // Same lookup -> Hit - _, err = regexResolver.TenantIDs(ctx1) - require.NoError(t, err) - checkMetrics(1, 1) // 1 Hit, 1 Miss - - // Same lookup again -> Hit - _, err = regexResolver.TenantIDs(ctx1) - require.NoError(t, err) - checkMetrics(2, 1) // 2 Hits, 1 Miss - - // Different lookup -> Miss - ctx2 := user.InjectOrgID(context.Background(), "user-1") - _, err = regexResolver.TenantIDs(ctx2) - require.NoError(t, err) - checkMetrics(2, 2) // 2 Hits, 2 Misses - - // Purge cache (simulate user sync) - regexResolver.Lock() - regexResolver.matchedCache.Purge() - regexResolver.Unlock() - - // Look up previously cached items after purge -> Misses - _, err = regexResolver.TenantIDs(ctx1) - require.NoError(t, err) - _, err = regexResolver.TenantIDs(ctx2) - require.NoError(t, err) - - checkMetrics(2, 4) // 2 Hits, 4 Misses -} - func BenchmarkRegexResolver_TenantIDs(b *testing.B) { numUsers := 1000 existingTenants := make([]string, numUsers) From 9c4240d5f233d3dbcc7d85a631648f792bb14322 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Fri, 20 Mar 2026 21:47:48 +0900 Subject: [PATCH 4/6] fix lint Signed-off-by: SungJin1212 --- pkg/querier/tenantfederation/regex_resolver.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pkg/querier/tenantfederation/regex_resolver.go b/pkg/querier/tenantfederation/regex_resolver.go index 403d5703dee..b120033952b 100644 --- a/pkg/querier/tenantfederation/regex_resolver.go +++ b/pkg/querier/tenantfederation/regex_resolver.go @@ -55,7 +55,6 @@ type RegexResolver struct { matchedCacheHits prometheus.Counter matchedCacheMisses prometheus.Counter - } func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, reg prometheus.Registerer, bucketClientFactory func(ctx context.Context) (objstore.InstrumentedBucket, error), logger log.Logger) (*RegexResolver, error) { @@ -69,14 +68,6 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, return nil, errors.Wrap(err, "failed to create users scanner") } - var matchedCache *lru.Cache[string, []string] - if tenantFederationCfg.RegexCacheSize > 0 { - matchedCache, err = lru.New[string, []string](tenantFederationCfg.RegexCacheSize) - if err != nil { - return nil, errors.Wrap(err, "failed to create regex cache") - } - } - r := &RegexResolver{ userSyncInterval: tenantFederationCfg.UserSyncInterval, maxTenant: tenantFederationCfg.MaxTenant, @@ -85,7 +76,7 @@ func NewRegexResolver(cfg users.UsersScannerConfig, tenantFederationCfg Config, } if tenantFederationCfg.RegexCacheSize > 0 { - matchedCache, err = lru.New[string, []string](tenantFederationCfg.RegexCacheSize) + matchedCache, err := lru.New[string, []string](tenantFederationCfg.RegexCacheSize) if err != nil { return nil, errors.Wrap(err, "failed to create regex cache") } From 363d6a56064d86d66cc984ca8459961bbb7899c0 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Sat, 21 Mar 2026 12:49:24 +0900 Subject: [PATCH 5/6] update v1-guarantees Signed-off-by: SungJin1212 --- docs/configuration/v1-guarantees.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration/v1-guarantees.md b/docs/configuration/v1-guarantees.md index 0508abb6e90..4bb632023e5 100644 --- a/docs/configuration/v1-guarantees.md +++ b/docs/configuration/v1-guarantees.md @@ -55,6 +55,7 @@ Currently experimental features are: - Blocks storage user index - Querier: tenant federation - `-tenant-federation.regex-matcher-enabled` + - `-tenant-federation.regex-cache-size` - `-tenant-federation.user-sync-interval` - `-tenant-federation.allow-partial-data` - The thanosconvert tool for converting Thanos block metadata to Cortex From 63c919a33a112585d89db4eb11906e7c3dabb91a Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Mon, 23 Mar 2026 08:50:44 +0900 Subject: [PATCH 6/6] annotation Signed-off-by: SungJin1212 --- pkg/querier/tenantfederation/regex_resolver.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/querier/tenantfederation/regex_resolver.go b/pkg/querier/tenantfederation/regex_resolver.go index b120033952b..7016c2ae0e9 100644 --- a/pkg/querier/tenantfederation/regex_resolver.go +++ b/pkg/querier/tenantfederation/regex_resolver.go @@ -120,8 +120,7 @@ func (r *RegexResolver) running(ctx context.Context) error { case <-ctx.Done(): return ctx.Err() case <-ticker.C: - // active and deleting users are considered - // The store-gateway can query for deleting users. + // Active and deleting users are considered active, deleting, _, err := r.userScanner.ScanUsers(ctx) if err != nil { level.Error(r.logger).Log("msg", "failed to discover users from bucket", "err", err)