diff --git a/docs/playbook-reference/actions/remediation.rst b/docs/playbook-reference/actions/remediation.rst
index abeee9198..5fb0d426e 100644
--- a/docs/playbook-reference/actions/remediation.rst
+++ b/docs/playbook-reference/actions/remediation.rst
@@ -16,6 +16,8 @@ Robusta includes actions that modify Kubernetes resources in your cluster. See a
 
 .. robusta-action:: playbooks.robusta_playbooks.autoscaler.alert_on_hpa_reached_limit on_horizontalpodautoscaler_update
 
+.. robusta-action:: playbooks.robusta_playbooks.aws_node_group_actions.eks_scale_node_group on_prometheus_alert
+
 .. robusta-action:: playbooks.robusta_playbooks.workload_actions.rollout_restart on_prometheus_alert
 
 .. robusta-action:: playbooks.robusta_playbooks.workload_actions.restart_named_rollout on_prometheus_alert
diff --git a/playbooks/robusta_playbooks/aws_node_group_actions.py b/playbooks/robusta_playbooks/aws_node_group_actions.py
new file mode 100644
index 000000000..dfe8a5209
--- /dev/null
+++ b/playbooks/robusta_playbooks/aws_node_group_actions.py
@@ -0,0 +1,123 @@
+import logging
+from typing import Optional
+
+import boto3
+from botocore.exceptions import BotoCoreError, ClientError
+
+from robusta.api import (
+    ActionException,
+    ActionParams,
+    ErrorCodes,
+    ExecutionBaseEvent,
+    Finding,
+    FindingSeverity,
+    FindingSource,
+    MarkdownBlock,
+    action,
+)
+
+
+class EksNodeGroupParams(ActionParams):
+    """
+    :var cluster_name: EKS cluster name.
+    :var region: AWS region where the cluster is located (e.g. us-east-1).
+    :var node_group_name: Name of the EKS managed node group to scale.
+    :var new_max_size: New maximum node count. Must exceed the current maxSize.
+    :var aws_access_key_id: Optional AWS access key ID. Falls back to instance role or environment.
+    :var aws_secret_access_key: Optional AWS secret access key.
+    """
+
+    cluster_name: str
+    region: str
+    node_group_name: str
+    new_max_size: int
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+
+
+@action
+def eks_scale_node_group(event: ExecutionBaseEvent, params: EksNodeGroupParams):
+    """
+    Increase the maximum size of an EKS managed node group.
+
+    Use as a remediation action when the cluster autoscaler cannot provision
+    new nodes because the node group has reached its configured maxSize.
+
+    Only maxSize is sent in the update: minSize and desiredSize are left untouched
+    so a concurrent change made by the cluster autoscaler is never overwritten.
+
+    Requires IAM permission: eks:DescribeNodegroup and eks:UpdateNodegroupConfig.
+    """
+    eks = boto3.client(
+        "eks",
+        region_name=params.region,
+        aws_access_key_id=params.aws_access_key_id,
+        aws_secret_access_key=params.aws_secret_access_key,
+    )
+
+    try:
+        ng = eks.describe_nodegroup(
+            clusterName=params.cluster_name,
+            nodegroupName=params.node_group_name,
+        )["nodegroup"]
+    except (BotoCoreError, ClientError) as e:
+        # BotoCoreError covers client-side failures (missing credentials, unreachable endpoint).
+        raise ActionException(
+            ErrorCodes.ACTION_UNEXPECTED_ERROR,
+            f"Failed to describe node group '{params.node_group_name}' "
+            f"in cluster '{params.cluster_name}': {e}",
+        ) from e
+
+    scaling = ng["scalingConfig"]
+    current_min = scaling["minSize"]
+    current_max = scaling["maxSize"]
+    current_desired = scaling["desiredSize"]
+
+    if params.new_max_size <= current_max:
+        event.add_enrichment(
+            [
+                MarkdownBlock(
+                    f"Node group *{params.node_group_name}* already has maxSize={current_max}. "
+                    f"Requested new_max_size={params.new_max_size} is not larger — no change made."
+                )
+            ]
+        )
+        return
+
+    try:
+        # Send maxSize only. Echoing back the desiredSize read above could revert a
+        # scale-up the cluster autoscaler performed between the describe and update calls.
+        eks.update_nodegroup_config(
+            clusterName=params.cluster_name,
+            nodegroupName=params.node_group_name,
+            scalingConfig={"maxSize": params.new_max_size},
+        )
+    except (BotoCoreError, ClientError) as e:
+        raise ActionException(
+            ErrorCodes.ACTION_UNEXPECTED_ERROR,
+            f"Failed to update node group '{params.node_group_name}': {e}",
+        ) from e
+
+    logging.info(
+        f"eks_scale_node_group: {params.cluster_name}/{params.node_group_name} "
+        f"maxSize {current_max} -> {params.new_max_size}"
+    )
+
+    finding = Finding(
+        title=f"EKS node group *{params.node_group_name}* maxSize updated",
+        severity=FindingSeverity.INFO,
+        source=FindingSource.MANUAL,
+        aggregation_key="EksNodeGroupScaled",
+    )
+    finding.add_enrichment(
+        [
+            MarkdownBlock(
+                f"*Cluster:* {params.cluster_name}\n"
+                f"*Region:* {params.region}\n"
+                f"*Node group:* {params.node_group_name}\n"
+                f"*maxSize:* {current_max} → {params.new_max_size}\n"
+                f"*desiredSize:* {current_desired} | *minSize:* {current_min}"
+            )
+        ]
+    )
+    event.add_finding(finding)
diff --git a/tests/test_aws_node_group_actions.py b/tests/test_aws_node_group_actions.py
new file mode 100644
index 000000000..c3d158860
--- /dev/null
+++ b/tests/test_aws_node_group_actions.py
@@ -0,0 +1,175 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+from botocore.exceptions import ClientError, EndpointConnectionError
+
+from robusta.api import ActionException
+
+
+DESCRIBE_RESPONSE = {
+    "nodegroup": {
+        "nodegroupName": "workers",
+        "scalingConfig": {
+            "minSize": 1,
+            "maxSize": 5,
+            "desiredSize": 3,
+        },
+    }
+}
+
+BASE_PARAMS = {
+    "cluster_name": "my-cluster",
+    "region": "us-west-2",
+    "node_group_name": "workers",
+    "new_max_size": 10,
+}
+
+
+def _make_client_error(code: str) -> ClientError:
+    """Build a ClientError with the given error code."""
+    return ClientError(
+        {"Error": {"Code": code, "Message": "mocked"}},
+        "operation",
+    )
+
+
+@pytest.fixture
+def mock_event():
+    """Return a MagicMock configured to track enrichments and findings."""
+    event = MagicMock()
+    event.findings = []
+    event.add_enrichment = MagicMock()
+    event.add_finding = MagicMock()
+    return event
+
+
+@patch("boto3.client")
+def test_scale_up_succeeds(mock_boto_client, mock_event):
+    """Verify only maxSize is sent to the EKS API on a successful scale-up."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.return_value = DESCRIBE_RESPONSE
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**BASE_PARAMS)
+    eks_scale_node_group(mock_event, params)
+
+    eks_mock.update_nodegroup_config.assert_called_once_with(
+        clusterName="my-cluster",
+        nodegroupName="workers",
+        scalingConfig={"maxSize": 10},
+    )
+    mock_event.add_finding.assert_called_once()
+    finding = mock_event.add_finding.call_args[0][0]
+    assert "workers" in finding.title
+    assert "maxSize updated" in finding.title
+
+
+@patch("boto3.client")
+def test_no_op_when_new_max_not_larger(mock_boto_client, mock_event):
+    """Verify no update is made when the requested max is below the current max."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.return_value = DESCRIBE_RESPONSE
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**{**BASE_PARAMS, "new_max_size": 3})
+    eks_scale_node_group(mock_event, params)
+
+    eks_mock.update_nodegroup_config.assert_not_called()
+    mock_event.add_enrichment.assert_called_once()
+    mock_event.add_finding.assert_not_called()
+    enrichment_blocks = mock_event.add_enrichment.call_args[0][0]
+    assert any("no change" in block.text for block in enrichment_blocks)
+
+
+@patch("boto3.client")
+def test_no_op_when_new_max_is_equal(mock_boto_client, mock_event):
+    """Verify no update is made when the requested max matches the current max."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.return_value = DESCRIBE_RESPONSE
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**{**BASE_PARAMS, "new_max_size": 5})
+    eks_scale_node_group(mock_event, params)
+
+    eks_mock.update_nodegroup_config.assert_not_called()
+    mock_event.add_enrichment.assert_called_once()
+    mock_event.add_finding.assert_not_called()
+
+
+@patch("boto3.client")
+def test_raises_on_describe_failure(mock_boto_client, mock_event):
+    """Verify ActionException is raised and update is skipped when describe fails."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.side_effect = _make_client_error("ResourceNotFoundException")
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**BASE_PARAMS)
+    with pytest.raises(ActionException) as exc_info:
+        eks_scale_node_group(mock_event, params)
+
+    assert isinstance(exc_info.value.__cause__, ClientError)
+    eks_mock.update_nodegroup_config.assert_not_called()
+
+
+@patch("boto3.client")
+def test_raises_on_update_failure(mock_boto_client, mock_event):
+    """Verify ActionException is raised when the nodegroup update call fails."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.return_value = DESCRIBE_RESPONSE
+    eks_mock.update_nodegroup_config.side_effect = _make_client_error("InvalidParameterException")
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**BASE_PARAMS)
+    with pytest.raises(ActionException) as exc_info:
+        eks_scale_node_group(mock_event, params)
+
+    assert isinstance(exc_info.value.__cause__, ClientError)
+
+
+@patch("boto3.client")
+def test_boto_client_uses_explicit_credentials(mock_boto_client, mock_event):
+    """Verify explicit AWS credentials are forwarded to boto3.client."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.return_value = DESCRIBE_RESPONSE
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(
+        **{**BASE_PARAMS, "aws_access_key_id": "AKID", "aws_secret_access_key": "SECRET"}
+    )
+    eks_scale_node_group(mock_event, params)
+
+    mock_boto_client.assert_called_once_with(
+        "eks",
+        region_name="us-west-2",
+        aws_access_key_id="AKID",
+        aws_secret_access_key="SECRET",
+    )
+
+
+@patch("boto3.client")
+def test_raises_on_botocore_error(mock_boto_client, mock_event):
+    """Verify client-side botocore failures are also wrapped in ActionException."""
+    from robusta_playbooks.aws_node_group_actions import EksNodeGroupParams, eks_scale_node_group
+
+    eks_mock = MagicMock()
+    eks_mock.describe_nodegroup.side_effect = EndpointConnectionError(endpoint_url="https://eks.invalid")
+    mock_boto_client.return_value = eks_mock
+
+    params = EksNodeGroupParams(**BASE_PARAMS)
+    with pytest.raises(ActionException) as exc_info:
+        eks_scale_node_group(mock_event, params)
+
+    assert isinstance(exc_info.value.__cause__, EndpointConnectionError)
+    eks_mock.update_nodegroup_config.assert_not_called()