From a50fbf6b5e3536f6e7549197aa1292327287fd62 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Fri, 30 Jan 2026 11:25:01 +0900 Subject: [PATCH] Delete duplicate ha tracker config in distributor Signed-off-by: SungJin1212 --- docs/configuration/config-file-reference.md | 18 +++--- pkg/distributor/distributor.go | 10 ++-- pkg/distributor/distributor_ha_tracker.go | 61 --------------------- pkg/distributor/distributor_test.go | 2 +- pkg/ha/ha_tracker.go | 8 +-- schemas/cortex-config-schema.json | 8 +-- 6 files changed, 22 insertions(+), 85 deletions(-) delete mode 100644 pkg/distributor/distributor_ha_tracker.go diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 4b78f1d5f0..6bd9dbb452 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3074,24 +3074,24 @@ pool: [health_check_ingesters: | default = true] ha_tracker: - # Enable the distributors HA tracker so that it can accept samples from - # Prometheus HA replicas gracefully (requires labels). + # Enable the HA tracker so that it can accept data from Prometheus HA replicas + # gracefully (requires labels). # CLI flag: -distributor.ha-tracker.enable [enable_ha_tracker: | default = false] - # Update the timestamp in the KV store for a given cluster/replica only after - # this amount of time has passed since the current stored timestamp. + # The time interval that must pass since the last timestamp update in the KV + # store before updating it again for a given cluster. # CLI flag: -distributor.ha-tracker.update-timeout [ha_tracker_update_timeout: | default = 15s] - # Maximum jitter applied to the update timeout, in order to spread the HA - # heartbeats over time. + # The maximum jitter applied to the update timeout to spread KV store updates + # over time. # CLI flag: -distributor.ha-tracker.update-timeout-jitter-max [ha_tracker_update_timeout_jitter_max: | default = 5s] - # If we don't receive any samples from the accepted replica for a cluster in - # this amount of time we will failover to the next replica we receive a sample - # from. This value must be greater than the update timeout + # The timeout after which a new replica will be accepted if the currently + # elected replica stops sending data. This value must be greater than the + # update timeout plus the maximum jitter. # CLI flag: -distributor.ha-tracker.failover-timeout [ha_tracker_failover_timeout: | default = 30s] diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 6ac313200a..56b57a3c09 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -144,7 +144,7 @@ type Distributor struct { type Config struct { PoolConfig PoolConfig `yaml:"pool"` - HATrackerConfig HATrackerConfig `yaml:"ha_tracker"` + HATrackerConfig ha.HATrackerConfig `yaml:"ha_tracker"` MaxRecvMsgSize int `yaml:"max_recv_msg_size"` OTLPMaxRecvMsgSize int `yaml:"otlp_max_recv_msg_size"` @@ -207,7 +207,7 @@ type OTLPConfig struct { // RegisterFlags adds the flags required to config this to the given FlagSet func (cfg *Config) RegisterFlags(f *flag.FlagSet) { cfg.PoolConfig.RegisterFlags(f) - cfg.HATrackerConfig.RegisterFlags(f) + cfg.HATrackerConfig.RegisterFlagsWithPrefix("distributor.", "", f) cfg.DistributorRing.RegisterFlags(f) f.IntVar(&cfg.MaxRecvMsgSize, "distributor.max-recv-msg-size", 100<<20, "remote_write API max receive message size (bytes).") @@ -243,9 +243,7 @@ func (cfg *Config) Validate(limits validation.Limits) error { return errInvalidTenantShardSize } - haHATrackerConfig := cfg.HATrackerConfig.ToHATrackerConfig() - - return haHATrackerConfig.Validate() + return cfg.HATrackerConfig.Validate() } const ( @@ -268,7 +266,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Title: "Cortex HA Tracker Status", ReplicaGroupLabel: "Cluster", } - haTracker, err := ha.NewHATracker(cfg.HATrackerConfig.ToHATrackerConfig(), limits, haTrackerStatusConfig, prometheus.WrapRegistererWithPrefix("cortex_", reg), "distributor-hatracker", log) + haTracker, err := ha.NewHATracker(cfg.HATrackerConfig, limits, haTrackerStatusConfig, prometheus.WrapRegistererWithPrefix("cortex_", reg), "distributor-hatracker", log) if err != nil { return nil, err } diff --git a/pkg/distributor/distributor_ha_tracker.go b/pkg/distributor/distributor_ha_tracker.go deleted file mode 100644 index c8d2011b3b..0000000000 --- a/pkg/distributor/distributor_ha_tracker.go +++ /dev/null @@ -1,61 +0,0 @@ -package distributor - -import ( - "flag" - "time" - - "github.com/cortexproject/cortex/pkg/ha" - "github.com/cortexproject/cortex/pkg/ring/kv" - "github.com/cortexproject/cortex/pkg/util/flagext" -) - -type HATrackerConfig struct { - EnableHATracker bool `yaml:"enable_ha_tracker"` - // We should only update the timestamp if the difference - // between the stored timestamp and the time we received a sample at - // is more than this duration. - UpdateTimeout time.Duration `yaml:"ha_tracker_update_timeout"` - UpdateTimeoutJitterMax time.Duration `yaml:"ha_tracker_update_timeout_jitter_max"` - // We should only failover to accepting samples from a replica - // other than the replica written in the KVStore if the difference - // between the stored timestamp and the time we received a sample is - // more than this duration - FailoverTimeout time.Duration `yaml:"ha_tracker_failover_timeout"` - // EnableStartupSync controls whether to fetch all tracked keys from the KV store - // on startup to populate the local cache. - // This prevents duplicate GET calls for the same key while the cache is cold, - // but could cause a spike in GET requests during initialization if the number - // of tracked keys is large. - EnableStartupSync bool `yaml:"enable_startup_sync"` - - KVStore kv.Config `yaml:"kvstore" doc:"description=Backend storage to use for the ring. Please be aware that memberlist is not supported by the HA tracker since gossip propagation is too slow for HA purposes."` -} - -// RegisterFlags adds the flags required to config this to the given FlagSet. -func (cfg *HATrackerConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.EnableHATracker, "distributor.ha-tracker.enable", false, "Enable the distributors HA tracker so that it can accept samples from Prometheus HA replicas gracefully (requires labels).") - f.DurationVar(&cfg.UpdateTimeout, "distributor.ha-tracker.update-timeout", 15*time.Second, "Update the timestamp in the KV store for a given cluster/replica only after this amount of time has passed since the current stored timestamp.") - f.DurationVar(&cfg.UpdateTimeoutJitterMax, "distributor.ha-tracker.update-timeout-jitter-max", 5*time.Second, "Maximum jitter applied to the update timeout, in order to spread the HA heartbeats over time.") - f.DurationVar(&cfg.FailoverTimeout, "distributor.ha-tracker.failover-timeout", 30*time.Second, "If we don't receive any samples from the accepted replica for a cluster in this amount of time we will failover to the next replica we receive a sample from. This value must be greater than the update timeout") - f.BoolVar(&cfg.EnableStartupSync, "distributor.ha-tracker.enable-startup-sync", false, "[Experimental] If enabled, fetches all tracked keys on startup to populate the local cache. This prevents duplicate GET calls for the same key while the cache is cold, but could cause a spike in GET requests during initialization if the number of tracked keys is large.") - - // We want the ability to use different Consul instances for the ring and - // for HA cluster tracking. We also customize the default keys prefix, in - // order to not clash with the ring key if they both share the same KVStore - // backend (ie. run on the same consul cluster). - cfg.KVStore.RegisterFlagsWithPrefix("distributor.ha-tracker.", "ha-tracker/", f) -} - -func (cfg *HATrackerConfig) ToHATrackerConfig() ha.HATrackerConfig { - haCfg := ha.HATrackerConfig{} - flagext.DefaultValues(&haCfg) - - haCfg.EnableHATracker = cfg.EnableHATracker - haCfg.UpdateTimeout = cfg.UpdateTimeout - haCfg.UpdateTimeoutJitterMax = cfg.UpdateTimeoutJitterMax - haCfg.FailoverTimeout = cfg.FailoverTimeout - haCfg.KVStore = cfg.KVStore - haCfg.EnableStartupSync = cfg.EnableStartupSync - - return haCfg -} diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 9cfc6a1501..c189a6c42f 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -3216,7 +3216,7 @@ func prepare(tb testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, [] ringStore, closer := consul.NewInMemoryClient(codec, log.NewNopLogger(), nil) tb.Cleanup(func() { assert.NoError(tb, closer.Close()) }) mock := kv.PrefixClient(ringStore, "prefix") - distributorCfg.HATrackerConfig = HATrackerConfig{ + distributorCfg.HATrackerConfig = ha.HATrackerConfig{ EnableHATracker: true, KVStore: kv.Config{Mock: mock}, UpdateTimeout: 100 * time.Millisecond, diff --git a/pkg/ha/ha_tracker.go b/pkg/ha/ha_tracker.go index e3161a7500..11b38eb79c 100644 --- a/pkg/ha/ha_tracker.go +++ b/pkg/ha/ha_tracker.go @@ -92,10 +92,10 @@ func (cfg *HATrackerConfig) RegisterFlagsWithPrefix(flagPrefix string, kvPrefix finalKVPrefix = kvPrefix } - f.BoolVar(&cfg.EnableHATracker, finalFlagPrefix+"ha-tracker.enable", false, "Enable the HA tracker so that it can accept data from Prometheus HA replicas gracefully.") - f.DurationVar(&cfg.UpdateTimeout, finalFlagPrefix+"ha-tracker.update-timeout", 15*time.Second, "Update the timestamp in the KV store for a given cluster/replicaGroup only after this amount of time has passed since the current stored timestamp.") - f.DurationVar(&cfg.UpdateTimeoutJitterMax, finalFlagPrefix+"ha-tracker.update-timeout-jitter-max", 5*time.Second, "Maximum jitter applied to the update timeout, in order to spread the HA heartbeats over time.") - f.DurationVar(&cfg.FailoverTimeout, finalFlagPrefix+"ha-tracker.failover-timeout", 30*time.Second, "If we don't receive any data from the accepted replica for a cluster/replicaGroup in this amount of time we will failover to the next replica we receive a sample from. This value must be greater than the update timeout") + f.BoolVar(&cfg.EnableHATracker, finalFlagPrefix+"ha-tracker.enable", false, "Enable the HA tracker so that it can accept data from Prometheus HA replicas gracefully (requires labels).") + f.DurationVar(&cfg.UpdateTimeout, finalFlagPrefix+"ha-tracker.update-timeout", 15*time.Second, "The time interval that must pass since the last timestamp update in the KV store before updating it again for a given cluster.") + f.DurationVar(&cfg.UpdateTimeoutJitterMax, finalFlagPrefix+"ha-tracker.update-timeout-jitter-max", 5*time.Second, "The maximum jitter applied to the update timeout to spread KV store updates over time.") + f.DurationVar(&cfg.FailoverTimeout, finalFlagPrefix+"ha-tracker.failover-timeout", 30*time.Second, "The timeout after which a new replica will be accepted if the currently elected replica stops sending data. This value must be greater than the update timeout plus the maximum jitter.") f.BoolVar(&cfg.EnableStartupSync, finalFlagPrefix+"ha-tracker.enable-startup-sync", false, "[Experimental] If enabled, fetches all tracked keys on startup to populate the local cache. This prevents duplicate GET calls for the same key while the cache is cold, but could cause a spike in GET requests during initialization if the number of tracked keys is large.") // We want the ability to use different Consul instances for the ring and diff --git a/schemas/cortex-config-schema.json b/schemas/cortex-config-schema.json index 47ebd5b036..d8cc325820 100644 --- a/schemas/cortex-config-schema.json +++ b/schemas/cortex-config-schema.json @@ -3718,7 +3718,7 @@ "properties": { "enable_ha_tracker": { "default": false, - "description": "Enable the distributors HA tracker so that it can accept samples from Prometheus HA replicas gracefully (requires labels).", + "description": "Enable the HA tracker so that it can accept data from Prometheus HA replicas gracefully (requires labels).", "type": "boolean", "x-cli-flag": "distributor.ha-tracker.enable" }, @@ -3730,21 +3730,21 @@ }, "ha_tracker_failover_timeout": { "default": "30s", - "description": "If we don't receive any samples from the accepted replica for a cluster in this amount of time we will failover to the next replica we receive a sample from. This value must be greater than the update timeout", + "description": "The timeout after which a new replica will be accepted if the currently elected replica stops sending data. This value must be greater than the update timeout plus the maximum jitter.", "type": "string", "x-cli-flag": "distributor.ha-tracker.failover-timeout", "x-format": "duration" }, "ha_tracker_update_timeout": { "default": "15s", - "description": "Update the timestamp in the KV store for a given cluster/replica only after this amount of time has passed since the current stored timestamp.", + "description": "The time interval that must pass since the last timestamp update in the KV store before updating it again for a given cluster.", "type": "string", "x-cli-flag": "distributor.ha-tracker.update-timeout", "x-format": "duration" }, "ha_tracker_update_timeout_jitter_max": { "default": "5s", - "description": "Maximum jitter applied to the update timeout, in order to spread the HA heartbeats over time.", + "description": "The maximum jitter applied to the update timeout to spread KV store updates over time.", "type": "string", "x-cli-flag": "distributor.ha-tracker.update-timeout-jitter-max", "x-format": "duration"