Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions cardano-node/src/Cardano/Node/Tracing/API.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MonoLocalBinds #-}
{-# LANGUAGE PackageImports #-}
Expand All @@ -11,7 +12,7 @@ module Cardano.Node.Tracing.API
) where

import Cardano.Logging hiding (traceWith)
import Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple)
import Cardano.Logging.Prometheus.TCPServer
import Cardano.Node.Configuration.NodeAddress (PortNumber)
import Cardano.Node.Configuration.POM (NodeConfiguration (..))
import Cardano.Node.Protocol.Types
Expand All @@ -36,11 +37,12 @@ import Ouroboros.Network.NodeToNode (RemoteAddress)

import Prelude

import Control.Concurrent.Async (link)
import Control.DeepSeq (deepseq)
import Control.Exception (SomeException (..))
import Control.Monad (forM_)
import "contra-tracer" Control.Tracer (traceWith)
import "trace-dispatcher" Control.Tracer (nullTracer)
import Data.Functor.Contravariant ((>$<))
import qualified Data.Map.Strict as Map
import Data.Maybe
import Data.Time.Clock (getCurrentTime)
Expand Down Expand Up @@ -83,10 +85,7 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do

traceWith (nodeStateTracer tracers) NodeTracingOnlineConfiguring

mError <- kickoffPrometheusSimple
forM_ mError $ \errMsg ->
let errMsg' = "PrometheusSimple backend disabled due to initialisation error: " ++ errMsg
in traceWith (nodeStateTracer tracers) (NodeTracingFailure errMsg')
kickoffPrometheusSimple

startResourceTracer
(resourcesTracer tracers)
Expand All @@ -105,16 +104,14 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do
mkTracers
:: TraceConfig
-> IO ( IO ()
, IO (Maybe String)
, IO ()
, Tracers RemoteAddress LocalAddress blk IO
)
mkTracers trConfig = mdo
ekgStore <- EKG.newStore
EKG.registerGcMetrics ekgStore
ekgTrace <- ekgTracer trConfig ekgStore

let kickoffPrometheusSimple = maybe (pure Nothing) (runPrometheusSimple ekgStore) prometheusSimple

stdoutTrace <- standardTracer

-- We should initialize forwarding only if 'Forwarder' backend
Expand Down Expand Up @@ -155,6 +152,16 @@ initTraceDispatcher nc p networkMagic nodeKernel noBlockForging = do
dpTracer
trConfig
p

let
kickoffPrometheusSimple = case prometheusSimple of
Nothing -> pure ()
Just ps ->
let
!nsTr = nodeStateTracer tracers
!tracePrometheus = NodePrometheusSimple >$< nsTr
in runPrometheusSimple tracePrometheus ekgStore ps >>= link

pure (kickoffForwarder, kickoffPrometheusSimple, tracers)

where
Expand Down
27 changes: 26 additions & 1 deletion cardano-node/src/Cardano/Node/Tracing/StateRep.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ module Cardano.Node.Tracing.StateRep
import Cardano.Api (textShow)

import Cardano.Logging
import Cardano.Logging.Prometheus.TCPServer (TracePrometheusSimple (..))
import Cardano.Node.Handlers.Shutdown (ShutdownTrace)
import Cardano.Node.Protocol.Types (SomeConsensusProtocol (..))
import qualified Cardano.Node.Startup as Startup
Expand All @@ -48,6 +49,11 @@ deriving instance ToJSON ChunkNo

deriving instance NFData ChunkNo

-- Orphan instances: 'TracePrometheusSimple' is imported from
-- "Cardano.Logging.Prometheus.TCPServer", not defined in this module.
-- It is embedded in 'NodeState' via the 'NodePrometheusSimple' constructor,
-- so it needs the instances that 'NodeState' itself derives
-- (NOTE(review): only the 'NFData' derivation of 'NodeState' is visible
-- here; presumably the JSON/Generic ones exist too -- confirm).
deriving instance Generic TracePrometheusSimple
deriving instance FromJSON TracePrometheusSimple
deriving instance ToJSON TracePrometheusSimple
deriving instance NFData TracePrometheusSimple

data OpeningDbs
= StartedOpeningImmutableDB
| OpenedImmutableDB (WithOrigin SlotNo) ChunkNo
Expand Down Expand Up @@ -100,6 +106,7 @@ data NodeState
= NodeTracingOnlineConfiguring
| NodeTracingFailure String
| NodeTracingForwardingInterrupted HowToConnect String
| NodePrometheusSimple TracePrometheusSimple
| NodeOpeningDbs OpeningDbs
| NodeReplays Replays
| NodeInitChainSelection InitChainSelection
Expand All @@ -112,7 +119,7 @@ data NodeState
deriving instance (NFData NodeState)

instance LogFormatting NodeState where
forMachine _ = \case
forMachine _dtal = \case
NodeTracingOnlineConfiguring -> mconcat
[ "kind" .= String "NodeTracingOnlineConfiguring" ]
NodeOpeningDbs x -> mconcat
Expand All @@ -136,12 +143,16 @@ instance LogFormatting NodeState where
, "conn" .= howToConnect
, "message" .= toJSON x
]
NodePrometheusSimple promSimple ->
forMachine _dtal promSimple

forHuman = \case
NodeTracingFailure errMsg ->
T.pack errMsg
NodeTracingForwardingInterrupted howToConnect errMsg ->
T.pack $ "trace forwarding connection with " <> show howToConnect <> " failed: " <> errMsg
NodePrometheusSimple promSimple ->
forHuman promSimple
_
-> ""

Expand All @@ -152,6 +163,10 @@ instance MetaTrace NodeState where
Namespace [] ["NodeTracingFailure"]
namespaceFor NodeTracingForwardingInterrupted {} =
Namespace [] ["NodeTracingForwardingInterrupted"]
namespaceFor (NodePrometheusSimple TracePrometheusSimpleStart{}) =
Namespace [] ["PrometheusSimple", "Start"]
namespaceFor (NodePrometheusSimple TracePrometheusSimpleStop{}) =
Namespace [] ["PrometheusSimple", "Stop"]
namespaceFor NodeOpeningDbs {} =
Namespace [] ["OpeningDbs"]
namespaceFor NodeReplays {} =
Expand All @@ -173,6 +188,10 @@ instance MetaTrace NodeState where
Just Error
severityFor (Namespace _ ["NodeTracingForwardingInterrupted"]) _ =
Just Warning
severityFor (Namespace _ ["PrometheusSimple", "Start"]) _ =
Just Info
severityFor (Namespace _ ["PrometheusSimple", "Stop"]) _ =
Just Warning
severityFor (Namespace _ ["OpeningDbs"]) _ =
Just Info
severityFor (Namespace _ ["NodeReplays"]) _ =
Expand Down Expand Up @@ -210,12 +229,18 @@ instance MetaTrace NodeState where
"Node startup"
documentFor (Namespace _ ["NodeShutdown"]) = Just
"Node shutting down"
documentFor (Namespace _ ["PrometheusSimple", "Start"]) =
Just "PrometheusSimple backend is starting"
documentFor (Namespace _ ["PrometheusSimple", "Stop"]) =
Just "PrometheusSimple backend stopped"
documentFor _ns = Nothing

allNamespaces = [
Namespace [] ["NodeTracingOnlineConfiguring"]
, Namespace [] ["NodeTracingFailure"]
, Namespace [] ["NodeTracingForwardingInterrupted"]
, Namespace [] ["PrometheusSimple", "Start"]
, Namespace [] ["PrometheusSimple", "Stop"]
, Namespace [] ["OpeningDbs"]
, Namespace [] ["NodeReplays"]
, Namespace [] ["NodeInitChainSelection"]
Expand Down
1 change: 1 addition & 0 deletions trace-dispatcher/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## 2.11.1 -- Dec 2025

* Increase `PrometheusSimple` robustness by restarting the backend upon crash, adding start/stop traces, and reaping dangling sockets more eagerly
* Removed `TraceConfig.tcPeerFrequency` and hence `TraceOptionPeerFrequency` from config representation
* Removed unused module `Cardano.Logging.Types.NodePeers`

Expand Down
18 changes: 9 additions & 9 deletions trace-dispatcher/src/Cardano/Logging/Prometheus/NetworkRun.hs
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ data NetworkRunParams = NetworkRunParams

defaultRunParams :: String -> NetworkRunParams
defaultRunParams name = NetworkRunParams
{ runSocketTimeout = 30
{ runSocketTimeout = 22
, runSocketGraceful = 1000
, runRecvMaxSize = 2048
, runRateLimit = 3.0
, runConnLimitGlobal = 12
, runConnLimitPerHost = 3
, runConnLimitPerHost = 4
, runServerName = name
}

Expand All @@ -68,8 +68,8 @@ mkTCPServerRunner
:: NetworkRunParams
-> Maybe HostName
-> PortNumber
-> TimeoutServer a
-> IO (IO a)
-> TimeoutServer ()
-> IO (IO ())
mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do
!sock <- openTCPServerSocket =<< resolve host portNo
let
Expand All @@ -81,8 +81,8 @@ mkTCPServerRunner runParams (fromMaybe "127.0.0.1" -> host) portNo server = do
runTCPServerWithSocket
:: NetworkRunParams
-> Socket
-> TimeoutServer a
-> IO a
-> TimeoutServer ()
-> IO ()
runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do
rateLimiter <- mkRateLimiter runServerName runRateLimit
ConnLimiter{..} <- mkConnLimiter runConnLimitGlobal runConnLimitPerHost
Expand All @@ -91,13 +91,13 @@ runTCPServerWithSocket runParams@NetworkRunParams{..} sock server = do
E.bracketOnError (accept sock) (close . fst) $ \(conn, peer) -> do
noLimitHit <- canServeThisPeer peer
if noLimitHit
then void $ forkFinally (server' mgr conn) (const $ gclose conn >> releasePeer peer)
then void $ forkFinally (runServer mgr conn) (const $ gclose conn >> releasePeer peer)
else close conn
where
gclose = if runSocketGraceful > 0 then flip gracefulClose runSocketGraceful else close
server' mgr conn = do
runServer mgr conn = do
threadLabelMe $ runServerName ++ " timeout server"
T.withHandle mgr (return ()) $ \timeoutHandle ->
T.withHandleKillThread mgr (return ()) $ \timeoutHandle ->
server runParams (T.tickle timeoutHandle) conn

resolve :: HostName -> PortNumber -> IO AddrInfo
Expand Down
61 changes: 52 additions & 9 deletions trace-dispatcher/src/Cardano/Logging/Prometheus/TCPServer.hs
Original file line number Diff line number Diff line change
@@ -1,20 +1,35 @@
{-# LANGUAGE PackageImports #-}

{-# OPTIONS_GHC -Wno-partial-fields #-}

-- | Run a simple Prometheus TCP server, responding *only* to the '/metrics' URL with current Node metrics
module Cardano.Logging.Prometheus.TCPServer (runPrometheusSimple) where
module Cardano.Logging.Prometheus.TCPServer
( runPrometheusSimple
, runPrometheusSimpleSilent

, TracePrometheusSimple (..)
) where

import Cardano.Logging.Prometheus.Exposition (renderExpositionFromSample)
import Cardano.Logging.Prometheus.NetworkRun
import Cardano.Logging.Types
import Cardano.Logging.Utils (runInLoop, showT)

import Control.Concurrent.Async (async, link)
import Control.Concurrent.Async (Async, async)
import qualified Control.Exception as E
import Control.Monad (when)
import Control.Monad (join, when)
import "contra-tracer" Control.Tracer
import Data.Aeson.Types as AE (Value (String), (.=))
import Data.ByteString (ByteString)
import Data.ByteString.Builder
import qualified Data.ByteString.Char8 as BC
import Data.Int (Int64)
import Data.List (find, intersperse)
import Data.Text as TS (pack)
import Data.Text.Lazy (Text)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.Encoding as T (encodeUtf8Builder)
import Data.Word (Word16)
import Network.HTTP.Date (epochTimeToHTTPDate, formatHTTPDate)
import Network.Socket (HostName, PortNumber)
import qualified Network.Socket.ByteString as Strict (recv)
Expand All @@ -24,13 +39,41 @@ import System.Posix.Types (EpochTime)
import System.PosixCompat.Time (epochTime)


-- Will provide a 'Just errormessage' iff creating the Prometheus server failed
runPrometheusSimple :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Maybe String)
runPrometheusSimple ekgStore (noSuffixes, mHost, portNo) =
E.try createRunner >>= \case
Left (E.SomeException e) -> pure (Just $ E.displayException e)
Right runner -> async runner >>= link >> pure Nothing
-- | Lifecycle events traced by the PrometheusSimple backend.
data TracePrometheusSimple
  = -- | The backend is being started; carries the port it is configured for.
    TracePrometheusSimpleStart
      { port :: Word16
      }
  | -- | The backend stopped; carries the rendered exception that caused it.
    TracePrometheusSimpleStop
      { message :: String
      }
  deriving Show

-- | Rendering of the PrometheusSimple lifecycle traces.
--
-- Both @"kind"@ tags follow the same scheme (constructor name without the
-- @Trace@ prefix), so consumers can match on @PrometheusSimpleStart@ /
-- @PrometheusSimpleStop@ uniformly.
instance LogFormatting TracePrometheusSimple where
  -- Machine-readable form; the requested detail level is ignored.
  forMachine _ = \case
    TracePrometheusSimpleStart portNo -> mconcat
      [ "kind" .= AE.String "PrometheusSimpleStart"
      , "port" .= portNo
      ]
    -- Bound as 'msg' rather than 'message' so the record selector of the
    -- same name is not shadowed.
    TracePrometheusSimpleStop msg -> mconcat
      [ "kind" .= AE.String "PrometheusSimpleStop"
      , "message" .= msg
      ]

  -- Human-readable one-liners.
  forHuman = \case
    TracePrometheusSimpleStart portNo -> "PrometheusSimple backend starting on port " <> showT portNo
    TracePrometheusSimpleStop msg -> "PrometheusSimple backend stop: " <> TS.pack msg


-- | Variant of 'runPrometheusSimple' that discards all start/stop traces
-- by handing it a 'nullTracer'.
runPrometheusSimpleSilent :: EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ())
runPrometheusSimpleSilent ekgStore serverParams =
  runPrometheusSimple nullTracer ekgStore serverParams

-- | Launch the PrometheusSimple server on its own 'Async' thread and hand
-- that thread back to the caller.
--
-- The server is driven by 'runInLoop': when it fails with an exception, a
-- 'TracePrometheusSimpleStop' is traced and the server is created again from
-- scratch, retrying in increasing intervals.
-- NOTE(review): the literals @1@ and @60@ are presumably the initial and
-- maximum retry delay -- confirm against 'runInLoop'.
runPrometheusSimple :: Tracer IO TracePrometheusSimple -> EKG.Store -> (Bool, Maybe HostName, PortNumber) -> IO (Async ())
runPrometheusSimple tr ekgStore (noSuffixes, mHost, portNo) =
  async (runInLoop startServer reportStop 1 60)
  where
    -- One full attempt: announce the start, build a fresh runner, then run it.
    -- Exceptions from either step escape to 'runInLoop'.
    startServer = do
      traceWith tr (TracePrometheusSimpleStart (fromIntegral portNo))
      runner <- createRunner
      runner

    -- Invoked by 'runInLoop' with the exception that interrupted the server.
    reportStop (E.SomeException e) =
      traceWith tr (TracePrometheusSimpleStop (E.displayException e))

    createRunner = mkTCPServerRunner (defaultRunParams "PrometheusSimple") mHost portNo (serveAccepted getCurrentExposition)
    getCurrentExposition = renderExpositionFromSample noSuffixes <$> sampleAll ekgStore

Expand Down
Loading