From a816effcb57bb4bccc14c3e3c4c793a599d23cb4 Mon Sep 17 00:00:00 2001 From: Lars Erik Wik Date: Thu, 21 May 2026 12:38:56 +0200 Subject: [PATCH 1/3] Raised cf-apache.service start timeout to avoid PID-file race cf-apache.service is Type=forking with PIDFile=$(sys.workdir)/httpd/httpd.pid, so systemd waits for the PID file before declaring the service started. apachectl writes the PID file shortly after fork, but on a busy host (e.g. during mission-portal upgrade with concurrent SELinux relabeling, cf-postgres and cf-php-fpm restarts) that gap has been observed to exceed the inherited default TimeoutStartSec of 90 s (see systemd-system.conf(5), DefaultTimeoutStartSec=). When systemd then SIGKILLs the apache parent, worker children survive holding 0.0.0.0:80, the unit enters a restart loop, and subsequent apachectl invocations from policy fail with "Address already in use". Raising TimeoutStartSec to 300 s gives apache enough headroom on a loaded host while still bounding startup time, so a genuinely hung httpd will still be terminated by systemd. Ticket: ENT-11189 ChangeLog: Title Signed-off-by: Lars Erik Wik Co-Authored-By: Claude Opus 4.7 (1M context) --- templates/cf-apache.service.mustache | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/templates/cf-apache.service.mustache b/templates/cf-apache.service.mustache index 9169be66b4..b0029646b0 100644 --- a/templates/cf-apache.service.mustache +++ b/templates/cf-apache.service.mustache @@ -18,6 +18,11 @@ ExecStart={{{vars.sys.workdir}}}/httpd/bin/apachectl start ExecStop={{{vars.sys.workdir}}}/httpd/bin/apachectl stop ExecReload={{{vars.sys.workdir}}}/httpd/bin/apachectl graceful PIDFile={{{vars.sys.workdir}}}/httpd/httpd.pid +# ENT-11189: apachectl writes the PID file shortly after fork. On a busy host +# (e.g. mid-upgrade with SELinux relabel, cf-postgres and cf-php-fpm churning) +# the default 90s start timeout has been observed to fire while apache is still +# coming up, leaving worker children bound to :80 and the unit in a restart loop. +TimeoutStartSec=300 Restart=always RestartSec=10 UMask=0177 From 22c4f31c7a5cca85e286a017c927675b42bd764b Mon Sep 17 00:00:00 2001 From: Lars Erik Wik Date: Thu, 21 May 2026 12:53:09 +0200 Subject: [PATCH 2/3] Check 'systemctl cat' instead of 'is-active' for cf-apache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mission_portal_apache_from_stage bundle uses the 'systemd_supervised' class to decide whether to manage cf-apache via systemd (services: promise) or by invoking apachectl directly (commands: promise). The class was set from 'systemctl -q is-active cf-apache', which returns non-zero whenever the unit is currently inactive or failed — including transient failures during an upgrade. In ENT-11189 we observed that this caused the policy to fall back to the direct-apachectl branch while systemd was concurrently retrying cf-apache in its own restart loop, leaving the two racing each other and apachectl failing with "Address already in use". Switching the probe to 'systemctl cat cf-apache' answers the right question — "does systemd know about this unit?" — which is true regardless of the unit's current active/failed/inactive state. Ticket: ENT-11189 ChangeLog: Title Signed-off-by: Lars Erik Wik Co-Authored-By: Claude Opus 4.7 (1M context) --- cfe_internal/enterprise/mission_portal.cf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cfe_internal/enterprise/mission_portal.cf b/cfe_internal/enterprise/mission_portal.cf index 4b75aab016..70118d736d 100644 --- a/cfe_internal/enterprise/mission_portal.cf +++ b/cfe_internal/enterprise/mission_portal.cf @@ -288,9 +288,10 @@ bundle agent mission_portal_apache_from_stage(config, staged_config) string => "Configure apache based on successfully staged config"; classes: - "systemd_supervised" - expression => returnszero("$(paths.systemctl) -q is-active cf-apache > /dev/null 2>&1", "useshell"), - if => fileexists( $(paths.systemctl) ); + "systemd_supervised" -> { "ENT-11189" } + expression => returnszero("$(paths.systemctl) cat cf-apache > /dev/null 2>&1", "useshell"), + if => fileexists( $(paths.systemctl) ), + comment => "Set when cf-apache.service is a unit known to systemd"; "httpd_config_validated" expression => strcmp("$(validate_result[exit_code])", "0"); From 4dd50f8d9088d284ad66c4e1a332559b60fc68d4 Mon Sep 17 00:00:00 2001 From: Lars Erik Wik Date: Thu, 21 May 2026 13:07:30 +0200 Subject: [PATCH 3/3] Reset cf-apache failed state before restarting it If cf-apache.service has been failing repeatedly, systemd latches it as 'failed' and refuses subsequent restart requests (StartLimitBurst / StartLimitIntervalSec, see systemd.unit(5)). The service_policy => "restart" below is then a silent no-op and the hub stays down. Add a methods promise that runs 'systemctl reset-failed cf-apache' via a new cf_apache_reset_failed_state helper, gated on mission_portal_apache_config_repaired so it only fires in the same agent pass that has just rewritten the apache config and is about to issue a restart. On idle runs it does nothing. Ticket: ENT-11189 ChangeLog: Title Signed-off-by: Lars Erik Wik Co-Authored-By: Claude Opus 4.7 (1M context) --- cfe_internal/enterprise/mission_portal.cf | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cfe_internal/enterprise/mission_portal.cf b/cfe_internal/enterprise/mission_portal.cf index 70118d736d..c9f8402768 100644 --- a/cfe_internal/enterprise/mission_portal.cf +++ b/cfe_internal/enterprise/mission_portal.cf @@ -368,6 +368,13 @@ bundle agent mission_portal_apache_from_stage(config, staged_config) contain => in_shell, comment => "We restart apache after the new valid config is in place"; + methods: + systemd_supervised:: + "Reset cf-apache failed state" -> { "ENT-11189" } + usebundle => cf_apache_reset_failed_state, + if => "mission_portal_apache_config_repaired", + comment => "Clear any latched failed state before restarting cf-apache"; + services: systemd_supervised:: "cf-apache" @@ -541,3 +548,15 @@ bundle agent cfe_enterprise_selfsigned_cert "DEBUG $(this.bundle): No Certificate Generation Requested" if => "!_cfe_enterprise_selfsigned_cert_regenerate_certificate"; } + +bundle agent cf_apache_reset_failed_state +# @brief Clear any latched 'failed' state on cf-apache.service so subsequent +# service operations are not refused by systemd's start rate limiter +# (StartLimitBurst). Safe no-op when the unit is not in a failed state. +{ + commands: + "$(paths.systemctl) reset-failed cf-apache" -> { "ENT-11189" } + contain => in_shell, + handle => "cf_apache_systemctl_reset_failed", + comment => "Reset latched failed state on cf-apache.service"; +}