Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions docs/additional_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,31 @@ If default labels are removed:
| 'custom5' | Linux | no match |
| 'custom5' | [ self-hosted, Linux ] | no match |
| 'custom5' | [ custom5, self-hosted, Linux ] | no match |

# Preventing Runner Scale-Down for Debugging

The module supports a bypass mechanism that allows you to prevent specific runners from being scaled down during debugging or investigation. This is useful when you need to access a runner instance directly to troubleshoot issues.

## Usage

To prevent a runner from being terminated during scale-down operations, add the `ghr:bypass-removal` tag to the EC2 instance with a value of `true`:

```bash
aws ec2 create-tags --resources <instance-id> --tags Key=ghr:bypass-removal,Value=true
```

When this tag is set, the scale-down process will skip the runner and log a message indicating that the runner is protected:

```
Runner 'i-xxxxxxxxxxxx' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.
```

## Removing the Protection

Once you've finished debugging and want the runner to be scaled down normally again, remove the tag or set it to any value other than `true`:

```bash
aws ec2 delete-tags --resources <instance-id> --tags Key=ghr:bypass-removal
```

**Note:** The bypass-removal tag only prevents automatic scale-down. The runner will continue to process jobs as normal. Make sure to remove the tag after debugging to ensure proper resource management. An ephemeral runner will also still terminate itself once its job is complete.
1 change: 1 addition & 0 deletions lambdas/functions/control-plane/src/aws/runners.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export interface RunnerList {
org?: string;
orphan?: boolean;
runnerId?: string;
bypassRemoval?: boolean;
}

export interface RunnerInfo {
Expand Down
3 changes: 3 additions & 0 deletions lambdas/functions/control-plane/src/aws/runners.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ describe('list instances', () => {
type: 'Org',
owner: 'CoderToCat',
orphan: false,
bypassRemoval: false,
});
});

Expand All @@ -105,6 +106,7 @@ describe('list instances', () => {
owner: 'CoderToCat',
orphan: false,
runnerId: '9876543210',
bypassRemoval: false,
});
});

Expand All @@ -124,6 +126,7 @@ describe('list instances', () => {
type: 'Org',
owner: 'CoderToCat',
orphan: true,
bypassRemoval: false,
});
});

Expand Down
1 change: 1 addition & 0 deletions lambdas/functions/control-plane/src/aws/runners.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) {
org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string,
orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true',
runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string,
bypassRemoval: i.Tags?.find((e) => e.Key === 'ghr:bypass-removal')?.Value === 'true',
});
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,25 @@ describe('Scale down runners', () => {
checkNonTerminated(runners);
});

it(`Should not terminate runner with bypass-removal tag set.`, async () => {
  // Arrange: build one runner via the shared factory, then mark it as
  // protected by flipping its bypassRemoval flag before it is mocked.
  const protectedRunner = createRunnerTestData(
    'idle-with-bypass',
    type,
    MINIMUM_TIME_RUNNING_IN_MINUTES + 10,
    true,
    false,
    false,
  );
  protectedRunner.bypassRemoval = true;
  const runners = [protectedRunner];

  // The same runner list is reported by both the GitHub and the AWS mocks.
  mockGitHubRunners(runners);
  mockAwsRunners(runners);

  // Act
  await scaleDown();

  // Assert: no termination was requested for the protected runner.
  expect(terminateRunner).not.toHaveBeenCalled();
  checkNonTerminated(runners);
});

it(`Should not terminate a runner that became busy just before deregister runner.`, async () => {
// setup
const runners = [
Expand Down Expand Up @@ -813,5 +832,6 @@ function createRunnerTestData(
orphan,
shouldBeTerminated,
runnerId: runnerId !== undefined ? String(runnerId) : undefined,
bypassRemoval: false,
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
const githubAppClient = await getOrCreateOctokit(ec2runner);
try {
const runnerList = ec2runner as unknown as RunnerList;
if (runnerList.bypassRemoval) {
logger.info(
`Runner '${ec2runner.instanceId}' has bypass-removal tag set, skipping removal. Remove the tag to allow scale-down.`,
);
return;
}

const states = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
Expand Down