From a021f67b165529fc4ed619838fdb300327e0ef4f Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Sat, 21 Feb 2026 05:30:25 +0000 Subject: [PATCH 1/4] controller: refactor GetConfig into helper functions and add architecture docs Extract config generation logic into reusable functions: - generateConfig() - renders device config with deduplication - processConfigRequest() - validates request and finds unknown BGP peers This refactoring prepares for adding GetConfigHash endpoint that will share the same config generation logic. Also add architecture documentation with sequence diagram showing agent-controller communication flow. --- controlplane/controller/README.md | 92 ++++++++ .../controller/internal/controller/server.go | 202 +++++++++++------- 2 files changed, 214 insertions(+), 80 deletions(-) diff --git a/controlplane/controller/README.md b/controlplane/controller/README.md index 27dbc7b87..0306430ae 100644 --- a/controlplane/controller/README.md +++ b/controlplane/controller/README.md @@ -2,6 +2,98 @@ The controller generates device configurations from Solana smart contract state and serves them to agents running on network devices via gRPC. +## Architecture + +### Agent-Controller Communication Flow + +The controller provides two gRPC endpoints, GetConfig and GetConfigHash, that the config agent (in ../agent/) uses to detect and apply configuration changes. The agent polls the controller every 5 seconds by default. + +The design includes two optimizations: +1. Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike, so the agent only applies the config when the config generated by the controller is different than the last polling cycle +2. To make success more likely on lossy networks, + +Here's how the agent uses the endpoints: + +``` +┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐ +│ Agent │ │ Controller │ │ Controller │ │ EOS │ +│ main() │ │GetConfigHash │ Config │ │ Device │ +│ │ │ GetConfig()│ │ Generator │ │ │ +└────┬────┘ └─────┬──────┘ └─────┬──────┘ └────┬────┘ + │ │ │ │ + │ Every 5s: │ │ │ + │ │ │ │ + │ GetBgpNeighbors() │ │ │ + ├─────────────────────────────────────────────────────────────────────────────────────────►│ + │◄─────────────────────────────────────────────────────────────────────────────────────────┤ + │ [peer IPs] │ │ │ + │ │ │ │ + │ Decision: should fetch? │ │ │ + │ • First run (no hash)? │ │ │ + │ • 1m since last apply? │ │ │ + │ • Hash changed? │ │ │ + │ │ │ │ + │ GetConfigHashFromServer() │ │ │ + ├───────────────────────────►│ │ │ + │ │ processConfigRequest() │ │ + │ ├─────────────────────────────►│ │ + │ │ │ generateConfig() │ + │ │ │ • deduplicateTunnels() │ + │ │ │ • renderConfig() │ + │ │ │ SHA256(config) │ + │ │◄─────────────────────────────┤ │ + │ │ [hash only] │ │ + │◄───────────────────────────┤ │ │ + │ ConfigHashResponse │ │ │ + │ {hash: "abc123..."} │ │ │ + │ (64 bytes) │ │ │ + │ │ │ │ + │ Compare: hash != lastHash? │ │ │ + │ │ │ │ + ├─── if YES (or first run or 5m timeout): │ + │ │ │ │ + │ fetchConfigFromController() │ │ + │ ├─► GetConfigFromServer() │ │ + │ │ ──────────────────► │ │ │ + │ │ │ processConfigRequest() │ │ + │ │ ├─────────────────────────────►│ │ + │ │ │ │ generateConfig() │ + │ │ │ │ • deduplicateTunnels() │ + │ │ │ │ • renderConfig() │ + │ │ │ │ (entire config text) │ + │ │ │◄─────────────────────────────┤ │ + │ │ ◄──────────────────│ [config string] │ │ + │ │ ConfigResponse │ │ │ + │ │ {config: "..."} │ │ │ + │ │ │ │ │ + │ ├─► computeChecksum(config) │ │ + │ │ [local SHA256] │ │ │ + │ │ │ │ │ + │ └─► return config+hash │ │ │ + │ │ │ │ + │ applyConfig() │ │ │ + │ └─► AddConfigToDevice(config) │ │ + │ ─────────────────────────────────────────────────────────────────────────────────►│ + │ ◄─────────────────────────────────────────────────────────────────────────────────┤ + │ [config applied] │ │ │ + │ │ │ │ + │ lastChecksum = hash │ │ │ + │ lastApplyTime = now │ │ │ + │ │ │ │ + ├─── else: skip this cycle (hash unchanged, no work needed) | │ + │ │ │ │ + │ sleep(5s) │ │ │ + │ goto top │ │ │ + │ │ │ │ +``` + +**Key Benefits:** +- **Network**: 64 bytes vs ~50KB on most cycles (99%+ reduction when config unchanged) +- **CPU**: Config generation still happens on controller (for hash), but EOS device skips apply +- **Safety**: Full config check every 5 minutes (300s) as fallback +- **Responsiveness**: Still checks for changes every 5 seconds +- **Decision points**: First run, 5m timeout, or hash mismatch triggers full fetch + ## Configuration ### ClickHouse Integration diff --git a/controlplane/controller/internal/controller/server.go b/controlplane/controller/internal/controller/server.go index ce3fd38d2..36d140846 100644 --- a/controlplane/controller/internal/controller/server.go +++ b/controlplane/controller/internal/controller/server.go @@ -2,7 +2,9 @@ package controller import ( "context" + "crypto/sha256" "crypto/tls" + "encoding/hex" "errors" "fmt" "log/slog" @@ -683,78 +685,13 @@ func (c *Controller) deduplicateTunnels(device *Device) []*Tunnel { return unique } -// GetConfig renders the latest device configuration based on cached device data -func (c *Controller) GetConfig(ctx context.Context, req *pb.ConfigRequest) (*pb.ConfigResponse, error) { - reqStart := time.Now() - c.mu.RLock() - defer c.mu.RUnlock() - device, ok := c.cache.Devices[req.GetPubkey()] - if !ok { - getConfigPubkeyErrors.WithLabelValues(req.GetPubkey()).Inc() - err := status.Errorf(codes.NotFound, "pubkey %s not found", req.Pubkey) - return nil, err - } - if len(device.DevicePathologies) > 0 { - err := status.Errorf(codes.FailedPrecondition, "cannot render config for device %s: %v", req.Pubkey, device.DevicePathologies) - return nil, err - } - +// generateConfig renders the device configuration. It must be called with c.mu held +// because it reads from c.cache, which is updated by a background goroutine. +func (c *Controller) generateConfig(pubkey string, device *Device, unknownPeers []net.IP) (string, error) { // Create shallow copy of device with deduplicated tunnels deviceForRender := *device deviceForRender.Tunnels = c.deduplicateTunnels(device) - agentVersion := req.GetAgentVersion() - agentCommit := req.GetAgentCommit() - agentDate := req.GetAgentDate() - - // Record metrics with device labels - getConfigOps.WithLabelValues( - req.GetPubkey(), - device.Code, - device.ContributorCode, - device.ExchangeCode, - device.LocationCode, - device.Status.String(), - agentVersion, - agentCommit, - agentDate, - ).Inc() - - // compare peers from device to on-chain - peerFound := func(peer net.IP) bool { - for _, tun := range deviceForRender.Tunnels { - if tun.OverlayDstIP.Equal(peer) { - return true - } - } - for _, bgpPeer := range c.cache.Vpnv4BgpPeers { // TODO: write a test that proves we don't remove ipv4/vpnv4 BGP peers - if bgpPeer.PeerIP.Equal(peer) { - return true - } - } - for _, bgpPeer := range c.cache.Ipv4BgpPeers { - if bgpPeer.PeerIP.Equal(peer) { - return true - } - } - return false - } - - unknownPeers := []net.IP{} - for _, peer := range req.GetBgpPeers() { - ip := net.ParseIP(peer) - if ip == nil { - continue - } - if peerFound(ip) { - continue - } - // Only remove peers with addresses that DZ has assigned. This will avoid removal of contributor-configured peers like DIA. - if isIPInBlock(ip, c.cache.Config.UserTunnelBlock) || isIPInBlock(ip, c.cache.Config.TunnelTunnelBlock) { - unknownPeers = append(unknownPeers, ip) - } - } - multicastGroupBlock := formatCIDR(&c.cache.Config.MulticastGroupBlock) // This check avoids the situation where the template produces the following useless output, which happens in any test case with a single DZD. @@ -769,21 +706,17 @@ func (c *Controller) GetConfig(ctx context.Context, req *pb.ConfigRequest) (*pb. var localASN uint32 if c.deviceLocalASN != 0 { - // Use the explicitly provided ASN localASN = c.deviceLocalASN } else if c.environment != "" { - // Get ASN from environment networkConfig, err := config.NetworkConfigForEnv(c.environment) if err != nil { - getConfigRenderErrors.WithLabelValues(req.GetPubkey()).Inc() - err := status.Errorf(codes.Internal, "failed to get network config for environment %s: %v", c.environment, err) - return nil, err + getConfigRenderErrors.WithLabelValues(pubkey).Inc() + return "", status.Errorf(codes.Internal, "failed to get network config for environment %s: %v", c.environment, err) } localASN = networkConfig.DeviceLocalASN } else { - getConfigRenderErrors.WithLabelValues(req.GetPubkey()).Inc() - err := status.Errorf(codes.Internal, "device local ASN not configured") - return nil, err + getConfigRenderErrors.WithLabelValues(pubkey).Inc() + return "", status.Errorf(codes.Internal, "device local ASN not configured") } data := templateData{ @@ -799,13 +732,94 @@ func (c *Controller) GetConfig(ctx context.Context, req *pb.ConfigRequest) (*pb. Strings: StringsHelper{}, } - config, err := renderConfig(data) + configStr, err := renderConfig(data) + if err != nil { + getConfigRenderErrors.WithLabelValues(pubkey).Inc() + return "", status.Errorf(codes.Aborted, "config rendering for pubkey %s failed: %v", pubkey, err) + } + + return configStr, nil +} + +// processConfigRequest validates the request and generates the config. +// It must be called with c.mu held. +// Returns the config string and the device (for metric labeling by the caller). +func (c *Controller) processConfigRequest(req *pb.ConfigRequest) (string, *Device, error) { + pubkey := req.GetPubkey() + device, ok := c.cache.Devices[pubkey] + if !ok { + getConfigPubkeyErrors.WithLabelValues(pubkey).Inc() + return "", nil, status.Errorf(codes.NotFound, "pubkey %s not found", pubkey) + } + if len(device.DevicePathologies) > 0 { + return "", nil, status.Errorf(codes.FailedPrecondition, "cannot render config for device %s: %v", pubkey, device.DevicePathologies) + } + + // Find unknown BGP peers that need to be removed + peerFound := func(peer net.IP) bool { + for _, tun := range device.Tunnels { + if tun.OverlayDstIP.Equal(peer) { + return true + } + } + for _, bgpPeer := range c.cache.Vpnv4BgpPeers { + if bgpPeer.PeerIP.Equal(peer) { + return true + } + } + for _, bgpPeer := range c.cache.Ipv4BgpPeers { + if bgpPeer.PeerIP.Equal(peer) { + return true + } + } + return false + } + + var unknownPeers []net.IP + for _, peer := range req.GetBgpPeers() { + ip := net.ParseIP(peer) + if ip == nil { + continue + } + if peerFound(ip) { + continue + } + if isIPInBlock(ip, c.cache.Config.UserTunnelBlock) || isIPInBlock(ip, c.cache.Config.TunnelTunnelBlock) { + unknownPeers = append(unknownPeers, ip) + } + } + + configStr, err := c.generateConfig(pubkey, device, unknownPeers) + if err != nil { + return "", nil, err + } + return configStr, device, nil +} + +// GetConfig renders the latest device configuration based on cached device data +func (c *Controller) GetConfig(ctx context.Context, req *pb.ConfigRequest) (*pb.ConfigResponse, error) { + reqStart := time.Now() + c.mu.RLock() + defer c.mu.RUnlock() + + configStr, device, err := c.processConfigRequest(req) if err != nil { - getConfigRenderErrors.WithLabelValues(req.GetPubkey()).Inc() - err := status.Errorf(codes.Aborted, "config rendering for pubkey %s failed: %v", req.Pubkey, err) return nil, err } - resp := &pb.ConfigResponse{Config: config} + + getConfigOps.WithLabelValues( + req.GetPubkey(), + device.Code, + device.ContributorCode, + device.ExchangeCode, + device.LocationCode, + device.Status.String(), + req.GetAgentVersion(), + req.GetAgentCommit(), + req.GetAgentDate(), + ).Inc() + + resp := &pb.ConfigResponse{Config: configStr} getConfigMsgSize.Observe(float64(proto.Size(resp))) getConfigDuration.Observe(float64(time.Since(reqStart).Seconds())) if c.clickhouse != nil { @@ -817,6 +831,34 @@ func (c *Controller) GetConfig(ctx context.Context, req *pb.ConfigRequest) (*pb. return resp, nil } +// GetConfigHash returns only the hash of the configuration for change detection +func (c *Controller) GetConfigHash(ctx context.Context, req *pb.ConfigRequest) (*pb.ConfigHashResponse, error) { + reqStart := time.Now() + c.mu.RLock() + defer c.mu.RUnlock() + + configStr, device, err := c.processConfigRequest(req) + if err != nil { + return nil, err + } + + getConfigHashOps.WithLabelValues( + req.GetPubkey(), + device.Code, + device.ContributorCode, + device.ExchangeCode, + device.LocationCode, + device.Status.String(), + req.GetAgentVersion(), + req.GetAgentCommit(), + req.GetAgentDate(), + ).Inc() + + hash := sha256.Sum256([]byte(configStr)) + getConfigDuration.Observe(float64(time.Since(reqStart).Seconds())) + return &pb.ConfigHashResponse{Hash: hex.EncodeToString(hash[:])}, nil +} + // formatCIDR formats a 5-byte network block into CIDR notation func formatCIDR(b *[5]byte) string { ip := net.IPv4(b[0], b[1], b[2], b[3]) From 9b8090c48aae44bb0003c46b3236b2696ae764aa Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Sat, 21 Feb 2026 05:30:56 +0000 Subject: [PATCH 2/4] controller: add GetConfigHash gRPC endpoint for lightweight config change detection Add new GetConfigHash RPC that returns only the SHA256 hash of the device configuration (64 bytes) instead of the full config (~50KB). This enables agents to efficiently check for config changes without transferring the full configuration on every poll. Changes: - Add GetConfigHash RPC to controller.proto - Implement GetConfigHash() handler that reuses processConfigRequest() - Add controller_grpc_getconfighash_requests_total metric - Regenerate protobuf code --- .../controller/internal/controller/metrics.go | 8 ++ .../proto/controller/controller.proto | 7 ++ .../controller/gen/pb-go/controller.pb.go | 111 ++++++++++++++---- .../gen/pb-go/controller_grpc.pb.go | 38 ++++++ 4 files changed, 144 insertions(+), 20 deletions(-) diff --git a/controlplane/controller/internal/controller/metrics.go b/controlplane/controller/internal/controller/metrics.go index 706a9b838..ed909a854 100644 --- a/controlplane/controller/internal/controller/metrics.go +++ b/controlplane/controller/internal/controller/metrics.go @@ -43,6 +43,13 @@ var ( []string{"pubkey", "device_code", "contributor_code", "exchange_code", "location_code", "device_status", "agent_version", "agent_commit", "agent_date"}, ) + getConfigHashOps = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_grpc_getconfighash_requests_total", + Help: "The total number of getconfighash requests", + }, + []string{"pubkey", "device_code", "contributor_code", "exchange_code", "location_code", "device_status", "agent_version", "agent_commit", "agent_date"}, + ) + getConfigMsgSize = prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "controller_grpc_getconfig_msg_size_bytes", Help: "The size of GetConfig response messages in bytes", @@ -101,6 +108,7 @@ func init() { prometheus.MustRegister(getConfigRenderErrors) prometheus.MustRegister(duplicateTunnelPairs) prometheus.MustRegister(getConfigOps) + prometheus.MustRegister(getConfigHashOps) prometheus.MustRegister(getConfigMsgSize) prometheus.MustRegister(getConfigDuration) diff --git a/controlplane/proto/controller/controller.proto b/controlplane/proto/controller/controller.proto index 8298e8a81..08f4cd189 100644 --- a/controlplane/proto/controller/controller.proto +++ b/controlplane/proto/controller/controller.proto @@ -8,6 +8,8 @@ option go_package = "github.com/malbeclabs/doublezero/controlplane/proto/control service Controller { // Returns the latest configuration of a DoubleZero node based on on-chain data rpc GetConfig (ConfigRequest) returns (ConfigResponse) {} + // Returns only the hash of the latest configuration (lightweight check for changes) + rpc GetConfigHash (ConfigRequest) returns (ConfigHashResponse) {} } // Request for latest configuration of a DoubleZero node based on its public key @@ -30,4 +32,9 @@ message BgpPeers { message ConfigResponse { string config = 1; string hash = 2; +} + +// Response containing only the config hash +message ConfigHashResponse { + string hash = 1; } \ No newline at end of file diff --git a/controlplane/proto/controller/gen/pb-go/controller.pb.go b/controlplane/proto/controller/gen/pb-go/controller.pb.go index d9813c885..ec4862417 100644 --- a/controlplane/proto/controller/gen/pb-go/controller.pb.go +++ b/controlplane/proto/controller/gen/pb-go/controller.pb.go @@ -220,6 +220,54 @@ func (x *ConfigResponse) GetHash() string { return "" } +// Response containing only the config hash +type ConfigHashResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Hash string `protobuf:"bytes,1,opt,name=hash,proto3" json:"hash,omitempty"` +} + +func (x *ConfigHashResponse) Reset() { + *x = ConfigHashResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_controller_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ConfigHashResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ConfigHashResponse) ProtoMessage() {} + +func (x *ConfigHashResponse) ProtoReflect() protoreflect.Message { + mi := &file_controller_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ConfigHashResponse.ProtoReflect.Descriptor instead. +func (*ConfigHashResponse) Descriptor() ([]byte, []int) { + return file_controller_proto_rawDescGZIP(), []int{3} +} + +func (x *ConfigHashResponse) GetHash() string { + if x != nil { + return x.Hash + } + return "" +} + var File_controller_proto protoreflect.FileDescriptor var file_controller_proto_rawDesc = []byte{ @@ -258,16 +306,24 @@ var file_controller_proto_rawDesc = []byte{ 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x12, 0x0a, 0x04, 0x68, 0x61, 0x73, 0x68, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x68, 0x61, 0x73, 0x68, - 0x32, 0x52, 0x0a, 0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x12, 0x44, - 0x0a, 0x09, 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x19, 0x2e, 0x63, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, - 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1a, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, - 0x73, 0x65, 0x22, 0x00, 0x42, 0x40, 0x5a, 0x3e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, - 0x6f, 0x6d, 0x2f, 0x6d, 0x61, 0x6c, 0x62, 0x65, 0x63, 0x6c, 0x61, 0x62, 0x73, 0x2f, 0x64, 0x6f, - 0x75, 0x62, 0x6c, 0x65, 0x7a, 0x65, 0x72, 0x6f, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x70, 0x6c, 0x61, 0x6e, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x6f, 0x6e, 0x74, - 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x22, 0x28, 0x0a, 0x12, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x48, 0x61, 0x73, 0x68, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x68, 0x61, 0x73, 0x68, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x68, 0x61, 0x73, 0x68, 0x32, 0xa0, 0x01, 0x0a, 0x0a, 0x43, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x12, 0x44, 0x0a, 0x09, 0x47, 0x65, 0x74, + 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x19, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x1a, 0x1a, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x43, + 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, + 0x4c, 0x0a, 0x0d, 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x48, 0x61, 0x73, 0x68, + 0x12, 0x19, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, + 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1e, 0x2e, 0x63, 0x6f, + 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x48, + 0x61, 0x73, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x40, 0x5a, + 0x3e, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x6d, 0x61, 0x6c, 0x62, + 0x65, 0x63, 0x6c, 0x61, 0x62, 0x73, 0x2f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x7a, 0x65, 0x72, + 0x6f, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x70, 0x6c, 0x61, 0x6e, 0x65, 0x2f, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x62, + 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -282,20 +338,23 @@ func file_controller_proto_rawDescGZIP() []byte { return file_controller_proto_rawDescData } -var file_controller_proto_msgTypes = make([]protoimpl.MessageInfo, 4) +var file_controller_proto_msgTypes = make([]protoimpl.MessageInfo, 5) var file_controller_proto_goTypes = []interface{}{ - (*ConfigRequest)(nil), // 0: controller.ConfigRequest - (*BgpPeers)(nil), // 1: controller.BgpPeers - (*ConfigResponse)(nil), // 2: controller.ConfigResponse - nil, // 3: controller.ConfigRequest.BgpPeersByVrfEntry + (*ConfigRequest)(nil), // 0: controller.ConfigRequest + (*BgpPeers)(nil), // 1: controller.BgpPeers + (*ConfigResponse)(nil), // 2: controller.ConfigResponse + (*ConfigHashResponse)(nil), // 3: controller.ConfigHashResponse + nil, // 4: controller.ConfigRequest.BgpPeersByVrfEntry } var file_controller_proto_depIdxs = []int32{ - 3, // 0: controller.ConfigRequest.bgp_peers_by_vrf:type_name -> controller.ConfigRequest.BgpPeersByVrfEntry + 4, // 0: controller.ConfigRequest.bgp_peers_by_vrf:type_name -> controller.ConfigRequest.BgpPeersByVrfEntry 1, // 1: controller.ConfigRequest.BgpPeersByVrfEntry.value:type_name -> controller.BgpPeers 0, // 2: controller.Controller.GetConfig:input_type -> controller.ConfigRequest - 2, // 3: controller.Controller.GetConfig:output_type -> controller.ConfigResponse - 3, // [3:4] is the sub-list for method output_type - 2, // [2:3] is the sub-list for method input_type + 0, // 3: controller.Controller.GetConfigHash:input_type -> controller.ConfigRequest + 2, // 4: controller.Controller.GetConfig:output_type -> controller.ConfigResponse + 3, // 5: controller.Controller.GetConfigHash:output_type -> controller.ConfigHashResponse + 4, // [4:6] is the sub-list for method output_type + 2, // [2:4] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name @@ -343,6 +402,18 @@ func file_controller_proto_init() { return nil } } + file_controller_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ConfigHashResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } } file_controller_proto_msgTypes[0].OneofWrappers = []interface{}{} type x struct{} @@ -351,7 +422,7 @@ func file_controller_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_controller_proto_rawDesc, NumEnums: 0, - NumMessages: 4, + NumMessages: 5, NumExtensions: 0, NumServices: 1, }, diff --git a/controlplane/proto/controller/gen/pb-go/controller_grpc.pb.go b/controlplane/proto/controller/gen/pb-go/controller_grpc.pb.go index c41e285ae..fde73232e 100644 --- a/controlplane/proto/controller/gen/pb-go/controller_grpc.pb.go +++ b/controlplane/proto/controller/gen/pb-go/controller_grpc.pb.go @@ -24,6 +24,8 @@ const _ = grpc.SupportPackageIsVersion7 type ControllerClient interface { // Returns the latest configuration of a DoubleZero node based on on-chain data GetConfig(ctx context.Context, in *ConfigRequest, opts ...grpc.CallOption) (*ConfigResponse, error) + // Returns only the hash of the latest configuration (lightweight check for changes) + GetConfigHash(ctx context.Context, in *ConfigRequest, opts ...grpc.CallOption) (*ConfigHashResponse, error) } type controllerClient struct { @@ -43,12 +45,23 @@ func (c *controllerClient) GetConfig(ctx context.Context, in *ConfigRequest, opt return out, nil } +func (c *controllerClient) GetConfigHash(ctx context.Context, in *ConfigRequest, opts ...grpc.CallOption) (*ConfigHashResponse, error) { + out := new(ConfigHashResponse) + err := c.cc.Invoke(ctx, "/controller.Controller/GetConfigHash", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + // ControllerServer is the server API for Controller service. // All implementations should embed UnimplementedControllerServer // for forward compatibility type ControllerServer interface { // Returns the latest configuration of a DoubleZero node based on on-chain data GetConfig(context.Context, *ConfigRequest) (*ConfigResponse, error) + // Returns only the hash of the latest configuration (lightweight check for changes) + GetConfigHash(context.Context, *ConfigRequest) (*ConfigHashResponse, error) } // UnimplementedControllerServer should be embedded to have forward compatible implementations. @@ -58,6 +71,9 @@ type UnimplementedControllerServer struct { func (UnimplementedControllerServer) GetConfig(context.Context, *ConfigRequest) (*ConfigResponse, error) { return nil, status.Errorf(codes.Unimplemented, "method GetConfig not implemented") } +func (UnimplementedControllerServer) GetConfigHash(context.Context, *ConfigRequest) (*ConfigHashResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetConfigHash not implemented") +} // UnsafeControllerServer may be embedded to opt out of forward compatibility for this service. // Use of this interface is not recommended, as added methods to ControllerServer will @@ -88,6 +104,24 @@ func _Controller_GetConfig_Handler(srv interface{}, ctx context.Context, dec fun return interceptor(ctx, in, info, handler) } +func _Controller_GetConfigHash_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(ConfigRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(ControllerServer).GetConfigHash(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/controller.Controller/GetConfigHash", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(ControllerServer).GetConfigHash(ctx, req.(*ConfigRequest)) + } + return interceptor(ctx, in, info, handler) +} + // Controller_ServiceDesc is the grpc.ServiceDesc for Controller service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) @@ -99,6 +133,10 @@ var Controller_ServiceDesc = grpc.ServiceDesc{ MethodName: "GetConfig", Handler: _Controller_GetConfig_Handler, }, + { + MethodName: "GetConfigHash", + Handler: _Controller_GetConfigHash_Handler, + }, }, Streams: []grpc.StreamDesc{}, Metadata: "controller.proto", From 37d58e00034fe6c21bdab21bb5187db81934d251 Mon Sep 17 00:00:00 2001 From: Nik Weidenbacher Date: Sat, 21 Feb 2026 05:31:16 +0000 Subject: [PATCH 3/4] agent: implement hash-based config polling with configurable cache timeout Replace aggressive 5-second full config polling with hash-based change detection. The agent now: - Checks config hash every 5 seconds (64 bytes) - Only fetches and applies full config when hash changes - Forces full config check after timeout (default 60s) as safety net This dramatically reduces: - Network bandwidth (99%+ when config unchanged) - EOS device load (no config application when unchanged) - Agent CPU (hash computed only when fetching new config) Add --config-cache-timeout-in-seconds flag to control the forced full config check interval. Refactor main loop: - Split pollControllerAndConfigureDevice into focused functions - Add computeChecksum() helper for SHA256 hashing - Add fetchConfigFromController() to get config and compute hash - Add applyConfig() to apply config to EOS device - Rename variables: cachedConfigHash, configCacheTime, configCacheTimeout Add GetConfigHashFromServer() client function to call new gRPC endpoint. --- CHANGELOG.md | 2 + controlplane/agent/cmd/agent/main.go | 95 +++++++++++++------ controlplane/agent/internal/agent/dzclient.go | 22 +++++ controlplane/controller/README.md | 2 +- 4 files changed, 93 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fef50c14..71a7a0e5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -294,6 +294,8 @@ All notable changes to this project will be documented in this file. - feat(smartcontract): add use_onchain_deallocation flag to MulticastGroup ([#2748](https://github.com/malbeclabs/doublezero/pull/2748)) - CLI - Remove restriction for a single tunnel per user; now a user can have a unicast and multicast tunnel concurrently (but can only be a publisher _or_ a subscriber) ([2728](https://github.com/malbeclabs/doublezero/pull/2728)) +- Device agents + - Reduce config agent network and CPU usage by checking config checksums every 5 seconds, and reducing full config check frquency to 1m ## [v0.8.3](https://github.com/malbeclabs/doublezero/compare/client/v0.8.2...client/v0.8.3) – 2026-01-22 diff --git a/controlplane/agent/cmd/agent/main.go b/controlplane/agent/cmd/agent/main.go index 7114b17ff..9985a71ec 100644 --- a/controlplane/agent/cmd/agent/main.go +++ b/controlplane/agent/cmd/agent/main.go @@ -2,6 +2,8 @@ package main import ( "context" + "crypto/sha256" + "encoding/hex" "flag" "fmt" "log" @@ -20,16 +22,17 @@ import ( ) var ( - localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network") - controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to") - device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.") - sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls") - controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up") - maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.") - verbose = flag.Bool("verbose", false, "Enable verbose logging") - showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit") - metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics") - metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics") + localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network") + controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to") + device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.") + sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls") + controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up") + configCacheTimeoutInSeconds = flag.Int("config-cache-timeout-in-seconds", 60, "Force full config fetch after this many seconds, even if hash unchanged") + maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.") + verbose = flag.Bool("verbose", false, "Enable verbose logging") + showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit") + metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics") + metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics") // set by LDFLAGS version = "dev" @@ -37,35 +40,33 @@ var ( date = "unknown" ) -func pollControllerAndConfigureDevice(ctx context.Context, dzclient pb.ControllerClient, eapiClient *arista.EAPIClient, pubkey string, verbose *bool, maxLockAge int, agentVersion string, agentCommit string, agentDate string) error { - var err error - - // The dz controller needs to know what BGP sessions we have configured locally - var neighborIpMap map[string][]string - neighborIpMap, err = eapiClient.GetBgpNeighbors(ctx) - if err != nil { - log.Println("pollControllerAndConfigureDevice: eapiClient.GetBgpNeighbors returned error:", err) - agent.ErrorsBgpNeighbors.Inc() - } +func computeChecksum(data string) string { + hash := sha256.Sum256([]byte(data)) + return hex.EncodeToString(hash[:]) +} - var configText string +func fetchConfigFromController(ctx context.Context, dzclient pb.ControllerClient, pubkey string, neighborIpMap map[string][]string, verbose *bool, agentVersion string, agentCommit string, agentDate string) (configText string, configHash string, err error) { configText, err = agent.GetConfigFromServer(ctx, dzclient, pubkey, neighborIpMap, controllerTimeoutInSeconds, agentVersion, agentCommit, agentDate) if err != nil { - log.Printf("pollControllerAndConfigureDevice failed to call agent.GetConfigFromServer: %q", err) + log.Printf("fetchConfigFromController failed to call agent.GetConfigFromServer: %q", err) agent.ErrorsGetConfig.Inc() - return err + return "", "", err } if *verbose { log.Printf("controller returned the following config: '%s'", configText) } + configHash = computeChecksum(configText) + return configText, configHash, nil +} + +func applyConfig(ctx context.Context, eapiClient *arista.EAPIClient, configText string, maxLockAge int) error { if configText == "" { - // Controller returned empty config return nil } - _, err = eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge) // 3rd arg (diffCmd) is only used for testing + _, err := eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge) if err != nil { agent.ErrorsApplyConfig.Inc() return err @@ -121,15 +122,55 @@ func main() { client := aristapb.NewEapiMgrServiceClient(clientConn) eapiClient = arista.NewEAPIClient(slog.Default(), client) + var cachedConfigHash string + var configCacheTime time.Time + configCacheTimeout := time.Duration(*configCacheTimeoutInSeconds) * time.Second + for { select { case <-ctx.Done(): return case <-ticker.C: - err := pollControllerAndConfigureDevice(ctx, dzclient, eapiClient, *localDevicePubkey, verbose, *maxLockAge, version, commit, date) + neighborIpMap, err := eapiClient.GetBgpNeighbors(ctx) + if err != nil { + log.Println("ERROR: eapiClient.GetBgpNeighbors returned", err) + agent.ErrorsBgpNeighbors.Inc() + } + + shouldFetchAndApply := false + + if cachedConfigHash == "" { + shouldFetchAndApply = true + } else if time.Since(configCacheTime) >= configCacheTimeout { + shouldFetchAndApply = true + } else { + hash, err := agent.GetConfigHashFromServer(ctx, dzclient, *localDevicePubkey, neighborIpMap, controllerTimeoutInSeconds, version, commit, date) + if err != nil { + log.Println("ERROR: GetConfigHashFromServer returned", err) + continue + } + if hash != cachedConfigHash { + shouldFetchAndApply = true + } + } + + if !shouldFetchAndApply { + continue + } + + configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date) + if err != nil { + log.Println("ERROR: fetchConfigFromController returned", err) + continue + } + + err = applyConfig(ctx, eapiClient, configText, *maxLockAge) if err != nil { - log.Println("ERROR: pollAndConfigureDevice returned", err) + log.Println("ERROR: applyConfig returned", err) + continue } + cachedConfigHash = configHash + configCacheTime = time.Now() } } } diff --git a/controlplane/agent/internal/agent/dzclient.go b/controlplane/agent/internal/agent/dzclient.go index 430dae3a8..886b94fd9 100644 --- a/controlplane/agent/internal/agent/dzclient.go +++ b/controlplane/agent/internal/agent/dzclient.go @@ -35,6 +35,28 @@ func GetConfigFromServer(ctx context.Context, client pb.ControllerClient, localD return config, nil } +func GetConfigHashFromServer(ctx context.Context, client pb.ControllerClient, localDevicePubkey string, neighborIpMap map[string][]string, controllerTimeoutInSeconds *float64, agentVersion string, agentCommit string, agentDate string) (hash string, err error) { + ctx, cancel := context.WithTimeout(ctx, time.Duration(*controllerTimeoutInSeconds*float64(time.Second))) + defer cancel() + + var bgpPeers []string + bgpPeersByVrf := make(map[string]*pb.BgpPeers) + for vrf, peers := range neighborIpMap { + bgpPeersByVrf[vrf] = &pb.BgpPeers{Peers: peers} + bgpPeers = append(bgpPeers, peers...) + } + slices.Sort(bgpPeers) + + req := &pb.ConfigRequest{Pubkey: localDevicePubkey, BgpPeers: bgpPeers, BgpPeersByVrf: bgpPeersByVrf, AgentVersion: &agentVersion, AgentCommit: &agentCommit, AgentDate: &agentDate} + resp, err := client.GetConfigHash(ctx, req) + if err != nil { + log.Printf("Error calling GetConfigHash: %v\n", err) + return "", err + } + + return resp.GetHash(), nil +} + func GetDzClient(controllerAddressAndPort string) (pb.ControllerClient, error) { conn, err := grpc.NewClient(controllerAddressAndPort, grpc.WithTransportCredentials(insecure.NewCredentials())) log.Printf("controllerAddressAndPort %s\n", controllerAddressAndPort) diff --git a/controlplane/controller/README.md b/controlplane/controller/README.md index 0306430ae..d4732ea32 100644 --- a/controlplane/controller/README.md +++ b/controlplane/controller/README.md @@ -12,7 +12,7 @@ The design includes two optimizations: 1. Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike, so the agent only applies the config when the config generated by the controller is different than the last polling cycle 2. To make success more likely on lossy networks, -Here's how the agent uses the endpoints: +fHere's how the agent uses the endpoints: ``` ┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐ From ab051f92f36c565582c1d723c6aadc78a6aa91a8 Mon Sep 17 00:00:00 2001 From: nikw9944 Date: Mon, 2 Mar 2026 20:19:26 +0000 Subject: [PATCH 4/4] controller: address PR review feedback for config hash optimization --- CHANGELOG.md | 2 +- controlplane/controller/README.md | 10 +- .../controller/internal/controller/metrics.go | 7 + .../controller/internal/controller/server.go | 19 +- .../internal/controller/server_test.go | 191 ++++++++++++++++++ 5 files changed, 214 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71a7a0e5d..0bf06f47b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -295,7 +295,7 @@ All notable changes to this project will be documented in this file. - CLI - Remove restriction for a single tunnel per user; now a user can have a unicast and multicast tunnel concurrently (but can only be a publisher _or_ a subscriber) ([2728](https://github.com/malbeclabs/doublezero/pull/2728)) - Device agents - - Reduce config agent network and CPU usage by checking config checksums every 5 seconds, and reducing full config check frquency to 1m + - Reduce config agent network and CPU usage by checking config checksums every 5 seconds, and reducing full config check frequency to 1m ## [v0.8.3](https://github.com/malbeclabs/doublezero/compare/client/v0.8.2...client/v0.8.3) – 2026-01-22 diff --git a/controlplane/controller/README.md b/controlplane/controller/README.md index d4732ea32..0680e5846 100644 --- a/controlplane/controller/README.md +++ b/controlplane/controller/README.md @@ -10,9 +10,9 @@ The controller provides two gRPC endpoints, GetConfig and GetConfigHash, that th The design includes two optimizations: 1. Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike, so the agent only applies the config when the config generated by the controller is different than the last polling cycle -2. To make success more likely on lossy networks, +2. To make success more likely on lossy networks, GetConfigHash returns only the hash (64 bytes) instead of the full config (~50KB+) -fHere's how the agent uses the endpoints: +Here's how the agent uses the endpoints: ``` ┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐ @@ -50,7 +50,7 @@ fHere's how the agent uses the endpoints: │ │ │ │ │ Compare: hash != lastHash? │ │ │ │ │ │ │ - ├─── if YES (or first run or 5m timeout): │ + ├─── if YES (or first run or 1m timeout): │ │ │ │ │ │ fetchConfigFromController() │ │ │ ├─► GetConfigFromServer() │ │ @@ -90,9 +90,9 @@ fHere's how the agent uses the endpoints: **Key Benefits:** - **Network**: 64 bytes vs ~50KB on most cycles (99%+ reduction when config unchanged) - **CPU**: Config generation still happens on controller (for hash), but EOS device skips apply -- **Safety**: Full config check every 5 minutes (300s) as fallback +- **Safety**: Full config check every 60s as fallback - **Responsiveness**: Still checks for changes every 5 seconds -- **Decision points**: First run, 5m timeout, or hash mismatch triggers full fetch +- **Decision points**: First run, 60s timeout, or hash mismatch triggers full fetch ## Configuration diff --git a/controlplane/controller/internal/controller/metrics.go b/controlplane/controller/internal/controller/metrics.go index ed909a854..10285aef5 100644 --- a/controlplane/controller/internal/controller/metrics.go +++ b/controlplane/controller/internal/controller/metrics.go @@ -62,6 +62,12 @@ var ( Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 5}, }) + getConfigHashDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "controller_grpc_getconfighash_duration_seconds", + Help: "The duration of GetConfigHash requests in seconds", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 5}, + }) + // cache update metrics cacheUpdateErrors = prometheus.NewCounter(prometheus.CounterOpts{ Name: "controller_cache_update_errors_total", @@ -111,6 +117,7 @@ func init() { prometheus.MustRegister(getConfigHashOps) prometheus.MustRegister(getConfigMsgSize) prometheus.MustRegister(getConfigDuration) + prometheus.MustRegister(getConfigHashDuration) // cache update metrics prometheus.MustRegister(cacheUpdateErrors) diff --git a/controlplane/controller/internal/controller/server.go b/controlplane/controller/internal/controller/server.go index 36d140846..e1891ea25 100644 --- a/controlplane/controller/internal/controller/server.go +++ b/controlplane/controller/internal/controller/server.go @@ -687,11 +687,8 @@ func (c *Controller) deduplicateTunnels(device *Device) []*Tunnel { // generateConfig renders the device configuration. It must be called with c.mu held // because it reads from c.cache, which is updated by a background goroutine. +// The provided device should already have deduplicated tunnels (via deduplicateTunnels). func (c *Controller) generateConfig(pubkey string, device *Device, unknownPeers []net.IP) (string, error) { - // Create shallow copy of device with deduplicated tunnels - deviceForRender := *device - deviceForRender.Tunnels = c.deduplicateTunnels(device) - multicastGroupBlock := formatCIDR(&c.cache.Config.MulticastGroupBlock) // This check avoids the situation where the template produces the following useless output, which happens in any test case with a single DZD. @@ -700,7 +697,7 @@ func (c *Controller) generateConfig(pubkey string, device *Device, unknownPeers // router msdp // ``` ipv4Peers := c.cache.Ipv4BgpPeers - if len(ipv4Peers) == 1 && ipv4Peers[0].PeerIP.Equal(deviceForRender.Ipv4LoopbackIP) { + if len(ipv4Peers) == 1 && ipv4Peers[0].PeerIP.Equal(device.Ipv4LoopbackIP) { ipv4Peers = nil } @@ -721,7 +718,7 @@ func (c *Controller) generateConfig(pubkey string, device *Device, unknownPeers data := templateData{ MulticastGroupBlock: multicastGroupBlock, - Device: &deviceForRender, + Device: device, Vpnv4BgpPeers: c.cache.Vpnv4BgpPeers, Ipv4BgpPeers: ipv4Peers, UnknownBgpPeers: unknownPeers, @@ -755,9 +752,13 @@ func (c *Controller) processConfigRequest(req *pb.ConfigRequest) (string, *Devic return "", nil, status.Errorf(codes.FailedPrecondition, "cannot render config for device %s: %v", pubkey, device.DevicePathologies) } + // Create shallow copy of device with deduplicated tunnels + deviceForRender := *device + deviceForRender.Tunnels = c.deduplicateTunnels(device) + // Find unknown BGP peers that need to be removed peerFound := func(peer net.IP) bool { - for _, tun := range device.Tunnels { + for _, tun := range deviceForRender.Tunnels { if tun.OverlayDstIP.Equal(peer) { return true } @@ -789,7 +790,7 @@ func (c *Controller) processConfigRequest(req *pb.ConfigRequest) (string, *Devic } } - configStr, err := c.generateConfig(pubkey, device, unknownPeers) + configStr, err := c.generateConfig(pubkey, &deviceForRender, unknownPeers) if err != nil { return "", nil, err } @@ -855,7 +856,7 @@ func (c *Controller) GetConfigHash(ctx context.Context, req *pb.ConfigRequest) ( ).Inc() hash := sha256.Sum256([]byte(configStr)) - getConfigDuration.Observe(float64(time.Since(reqStart).Seconds())) + getConfigHashDuration.Observe(float64(time.Since(reqStart).Seconds())) return &pb.ConfigHashResponse{Hash: hex.EncodeToString(hash[:])}, nil } diff --git a/controlplane/controller/internal/controller/server_test.go b/controlplane/controller/internal/controller/server_test.go index 451746550..8389f6026 100644 --- a/controlplane/controller/internal/controller/server_test.go +++ b/controlplane/controller/internal/controller/server_test.go @@ -3,6 +3,8 @@ package controller import ( "bytes" "context" + "crypto/sha256" + "encoding/hex" "io" "log" "log/slog" @@ -2444,3 +2446,192 @@ func Test_GetConfig_DuplicateTunnelPairs_Integration(t *testing.T) { t.Errorf("expected 2 tunnels in config (2 'tunnel source' lines), got %d", tunnelSourceCount) } } + +func TestGetConfigHash(t *testing.T) { + listener := bufconn.Listen(1024 * 1024) + server := grpc.NewServer() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + controller := &Controller{ + log: logger, + deviceLocalASN: 65342, + } + pb.RegisterControllerServer(server, controller) + + go func() { + if err := server.Serve(listener); err != nil { + log.Fatal(err) + } + }() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + opts := []grpc.DialOption{ + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return listener.Dial() + }), + } + conn, err := grpc.NewClient("passthrough://bufnet", opts...) + if err != nil { + t.Fatalf("error creating controller client: %v", err) + } + defer conn.Close() + defer cancel() + + agent := pb.NewControllerClient(conn) + + cache := stateCache{ + Config: serviceability.Config{ + MulticastGroupBlock: [5]uint8{239, 0, 0, 0, 24}, + }, + UnicastVrfs: []uint16{1}, + Devices: map[string]*Device{ + "abc123": { + PubKey: "abc123", + Code: "dz1", + Interfaces: []Interface{}, + ExchangeCode: "tst", + BgpCommunity: 10050, + PublicIP: net.IP{7, 7, 7, 7}, + Vpn4vLoopbackIP: net.IP{14, 14, 14, 14}, + Ipv4LoopbackIP: net.IP{15, 15, 15, 15}, + Vpn4vLoopbackIntfName: "Loopback255", + Ipv4LoopbackIntfName: "Loopback0", + IsisNet: "49.0000.0e0e.0e0e.0000.00", + DevicePathologies: []string{}, + Tunnels: []*Tunnel{ + { + Id: 500, + UnderlaySrcIP: net.IP{1, 1, 1, 1}, + UnderlayDstIP: net.IP{2, 2, 2, 2}, + OverlaySrcIP: net.IP{169, 254, 0, 0}, + OverlayDstIP: net.IP{169, 254, 0, 1}, + DzIp: net.IP{100, 0, 0, 0}, + PubKey: "user1", + Allocated: true, + VrfId: 1, + MetroRouting: true, + }, + }, + }, + }, + } + + controller.swapCache(cache) + + t.Run("returns valid sha256 hash", func(t *testing.T) { + resp, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("GetConfigHash returned error: %v", err) + } + if len(resp.Hash) != 64 { + t.Errorf("expected 64-character hex hash, got %d characters: %s", len(resp.Hash), resp.Hash) + } + // Verify it's valid hex + _, err = hex.DecodeString(resp.Hash) + if err != nil { + t.Errorf("hash is not valid hex: %v", err) + } + }) + + t.Run("hash matches sha256 of GetConfig response", func(t *testing.T) { + configResp, err := agent.GetConfig(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("GetConfig returned error: %v", err) + } + hashResp, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("GetConfigHash returned error: %v", err) + } + expectedHash := sha256.Sum256([]byte(configResp.Config)) + expectedHashStr := hex.EncodeToString(expectedHash[:]) + if hashResp.Hash != expectedHashStr { + t.Errorf("GetConfigHash hash %q does not match SHA256 of GetConfig config %q", hashResp.Hash, expectedHashStr) + } + }) + + t.Run("hash is stable across calls", func(t *testing.T) { + resp1, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("first GetConfigHash returned error: %v", err) + } + resp2, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("second GetConfigHash returned error: %v", err) + } + if resp1.Hash != resp2.Hash { + t.Errorf("expected stable hash across calls, got %q and %q", resp1.Hash, resp2.Hash) + } + }) + + t.Run("hash changes when config changes", func(t *testing.T) { + resp1, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("GetConfigHash returned error: %v", err) + } + + // Update cache with an additional tunnel + updatedCache := cache + updatedDevices := map[string]*Device{ + "abc123": { + PubKey: "abc123", + Code: "dz1", + Interfaces: []Interface{}, + ExchangeCode: "tst", + BgpCommunity: 10050, + PublicIP: net.IP{7, 7, 7, 7}, + Vpn4vLoopbackIP: net.IP{14, 14, 14, 14}, + Ipv4LoopbackIP: net.IP{15, 15, 15, 15}, + Vpn4vLoopbackIntfName: "Loopback255", + Ipv4LoopbackIntfName: "Loopback0", + IsisNet: "49.0000.0e0e.0e0e.0000.00", + DevicePathologies: []string{}, + Tunnels: []*Tunnel{ + { + Id: 500, + UnderlaySrcIP: net.IP{1, 1, 1, 1}, + UnderlayDstIP: net.IP{2, 2, 2, 2}, + OverlaySrcIP: net.IP{169, 254, 0, 0}, + OverlayDstIP: net.IP{169, 254, 0, 1}, + DzIp: net.IP{100, 0, 0, 0}, + PubKey: "user1", + Allocated: true, + VrfId: 1, + MetroRouting: true, + }, + { + Id: 501, + UnderlaySrcIP: net.IP{3, 3, 3, 3}, + UnderlayDstIP: net.IP{4, 4, 4, 4}, + OverlaySrcIP: net.IP{169, 254, 0, 2}, + OverlayDstIP: net.IP{169, 254, 0, 3}, + DzIp: net.IP{100, 0, 0, 1}, + PubKey: "user2", + Allocated: true, + VrfId: 1, + MetroRouting: true, + }, + }, + }, + } + updatedCache.Devices = updatedDevices + controller.swapCache(updatedCache) + + resp2, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "abc123"}) + if err != nil { + t.Fatalf("GetConfigHash returned error after update: %v", err) + } + if resp1.Hash == resp2.Hash { + t.Error("expected hash to change after config update, but it stayed the same") + } + + // Restore original cache + controller.swapCache(cache) + }) + + t.Run("returns error for unknown pubkey", func(t *testing.T) { + _, err := agent.GetConfigHash(ctx, &pb.ConfigRequest{Pubkey: "unknown"}) + if err == nil { + t.Fatal("expected error for unknown pubkey, got nil") + } + }) +}