Skip to content

Commit 1185b5f

Browse files
committed
contest-hw: initial implementation
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 04f4ebe commit 1185b5f

21 files changed

+4565
-8
lines changed

contest/__init__.py

Whitespace-only changes.

contest/hw/README.rst

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,22 @@ Config
216216

217217
- reservation timeout, seconds
218218

219+
CLI
220+
---
221+
222+
The ``nipa-mctrl`` CLI (``/usr/local/bin/nipa-mctrl`` on ctrl) provides
223+
command-line access to the machine_control API::
224+
225+
nipa-mctrl machines # list machines and health state
226+
nipa-mctrl nics # list NICs
227+
nipa-mctrl sol --machine-id 1 # view SOL logs
228+
nipa-mctrl reserve --machine-ids 1,2 # reserve machines
229+
nipa-mctrl close --reservation-id 5 # release a reservation
230+
nipa-mctrl power-cycle --machine-id 1 # power cycle via BMC
231+
232+
Add ``--json`` for machine-parseable output. Defaults to
233+
``http://localhost:5050``; override with ``--url`` or ``MC_URL`` env var.
234+
219235
In-memory state
220236
---------------
221237

@@ -256,14 +272,12 @@ The service discovers all machines using the ``machine_info`` table at startup.
256272
SOL collection
257273
~~~~~~~~~~~~~~
258274

259-
Service assumes BMC of the machines is already configured to send SOL
260-
logs to the correct place. The service uses ipmitool call to
261-
enable the SOL output at startup (and disable it at shutdown).
262-
263-
The service maintains a UDP socket to receive the logs.
264-
The BMC ipaddr from ``machine_info_sec`` is used to identify the sending
265-
machine. The service inserts the logs into the correct table
266-
and does line chunking if necessary.
275+
At startup the service spawns a persistent ``ipmitool sol activate``
276+
session for each machine (using BMC credentials from ``machine_info_sec``).
277+
Each session runs in its own thread, reading stdout and inserting lines
278+
into the ``sol`` table. If a session drops, it is automatically
279+
reconnected after a short delay. Stale sessions are deactivated before
280+
each new connection attempt.
267281

268282
Managing reservations
269283
~~~~~~~~~~~~~~~~~~~~~

contest/hw/hw_worker.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: GPL-2.0
3+
4+
"""NIPA HW worker — one-shot on-boot test runner."""
5+
6+
import json
7+
import os
8+
import subprocess
9+
import time
10+
11+
from lib.runner import find_newest_unseen, mark_all_seen, run_tests
12+
13+
14+
# Directory holding deployed test trees; main() passes it to
# find_newest_unseen() and mark_all_seen().
TESTS_DIR = '/srv/hw-worker/tests'
# Base directory for results; main() creates one subdirectory under it,
# named after the basename of the selected test directory.
RESULTS_DIR = '/srv/hw-worker/results'

# kselftest net.config keys (see drivers/net/README.rst)
# Only these keys are copied from nic-test.env into net.config, in this order.
_NET_CONFIG_KEYS = ['NETIF', 'LOCAL_V4', 'LOCAL_V6', 'REMOTE_V4', 'REMOTE_V6',
                    'LOCAL_PREFIX_V6', 'REMOTE_TYPE', 'REMOTE_ARGS']
20+
21+
22+
def _parse_env_file(path):
23+
"""Parse a simple KEY=VALUE env file."""
24+
env = {}
25+
if not os.path.exists(path):
26+
return env
27+
with open(path, encoding='utf-8') as fp:
28+
for line in fp:
29+
line = line.strip()
30+
if not line or line.startswith('#'):
31+
continue
32+
key, sep, val = line.partition('=')
33+
if sep:
34+
env[key.strip()] = val.strip()
35+
return env
36+
37+
38+
def _ensure_link_up(ifname):
    """Set an interface administratively up, then poll until it is operational.

    Runs ``ip link set <ifname> up`` and raises RuntimeError (carrying the
    command's stderr) if that fails.  Afterwards the link's ``operstate``
    is polled via ``ip -json link show`` up to 30 times, one second apart.
    Returns as soon as the state reads UP; if it never does, a warning is
    printed and the function returns normally.
    """
    set_up = subprocess.run(['ip', 'link', 'set', ifname, 'up'],
                            capture_output=True, check=False)
    if set_up.returncode != 0:
        detail = set_up.stderr.decode('utf-8', 'ignore').strip()
        raise RuntimeError(f"Failed to bring up {ifname}: {detail}")

    # Poll operstate until the link reports UP (link partner detected)
    show_cmd = ['ip', '-json', 'link', 'show', 'dev', ifname]
    polls_left = 30
    while polls_left > 0:
        polls_left -= 1
        shown = subprocess.run(show_cmd, capture_output=True, check=False)
        try:
            link = json.loads(shown.stdout)[0]
            state = link.get('operstate', '')
            if state.upper() == 'UP':
                return
        except (json.JSONDecodeError, IndexError):
            # Empty or unparsable output: treat as "not up yet" and retry
            pass
        time.sleep(1)

    print(f"Warning: {ifname} carrier not detected after 30s")
58+
59+
60+
def _normalize_ip(text):
    """Return the canonical string form of an IP address.

    Text that is not a valid IPv4/IPv6 address is returned unchanged so
    it can still be compared as a plain string.
    """
    import ipaddress  # function-scope: only this comparison needs it

    try:
        return str(ipaddress.ip_address(text))
    except ValueError:
        return text


def _ensure_addr(ifname, addr):
    """Add an IP address to an interface if not already present.

    ``addr`` may be given with or without a prefix length; when the
    prefix is missing, /64 is used for IPv6 and /24 for IPv4.

    Presence is decided by an exact match against the ``local`` addresses
    reported by ``ip -json addr show``, after normalising both sides.
    A plain substring search of the text output would treat "10.0.0.1"
    as present when only "10.0.0.10" is configured, and would miss an
    IPv6 address written in a non-canonical form.

    Raises subprocess.CalledProcessError if ``ip addr add`` fails.
    """
    bare_addr = addr.split('/')[0]
    ret = subprocess.run(['ip', '-json', 'addr', 'show', 'dev', ifname],
                         capture_output=True, check=False)
    try:
        links = json.loads(ret.stdout)
    except ValueError:
        # Command failed or produced no JSON: assume nothing is configured
        # and let "ip addr add" below report the real error, if any.
        links = []
    if not isinstance(links, list):
        links = []

    configured = {
        _normalize_ip(info['local'])
        for link in links if isinstance(link, dict)
        for info in link.get('addr_info', [])
        if isinstance(info, dict) and info.get('local')
    }
    if _normalize_ip(bare_addr) in configured:
        return

    if '/' not in addr:
        addr += '/64' if ':' in addr else '/24'
    subprocess.run(['ip', 'addr', 'add', addr, 'dev', ifname], check=True)
70+
71+
72+
def setup_test_interfaces(test_dir):
    """Bring up the test NICs described by nic-test.env and emit net.config.

    nic-test.env (deployed by the hwksft orchestrator into ``test_dir``)
    carries interface names, IP addresses and remote connectivity info.
    When the file is missing or empty nothing is done.  Otherwise:

    1. The DUT interface (NETIF) and, if named, the peer interface
       (REMOTE_IFNAME) are brought up.
    2. Their IPv4/IPv6 addresses are added if not already configured.
    3. If both NETIF and REMOTE_V4 are set, the peer is pinged once per
       attempt for up to 15 attempts; failure only prints a warning.
    4. The kselftest keys present in the env are written to net.config
       in each of drivers/net and drivers/net/hw that exists.
    """
    env = _parse_env_file(os.path.join(test_dir, 'nic-test.env'))
    if not env:
        return

    # DUT interface first, then the peer interface (used for loopback /
    # same-machine peers); each gets its v4 address before its v6 one.
    interface_plan = (
        ('NETIF', ('LOCAL_V4', 'LOCAL_V6')),
        ('REMOTE_IFNAME', ('REMOTE_V4', 'REMOTE_V6')),
    )
    for ifname_key, addr_keys in interface_plan:
        ifname = env.get(ifname_key)
        if not ifname:
            continue
        _ensure_link_up(ifname)
        for addr_key in addr_keys:
            address = env.get(addr_key)
            if address:
                _ensure_addr(ifname, address)

    # Wait for the peer to answer pings sent from the DUT interface
    netif = env.get('NETIF')
    peer_ip = env.get('REMOTE_V4', '').split('/')[0]
    if peer_ip and netif:
        ping_cmd = ['ping', '-c', '1', '-W', '1', '-I', netif, peer_ip]
        tries = 0
        while tries < 15:
            tries += 1
            probe = subprocess.run(ping_cmd, capture_output=True, check=False)
            if probe.returncode == 0:
                print(f"Peer {peer_ip} reachable after {tries}s")
                break
            time.sleep(1)
        else:
            print(f"Warning: peer {peer_ip} not reachable after 15s")

    # Collect the kselftest settings that are actually populated
    settings = [f'{key}={env[key]}' for key in _NET_CONFIG_KEYS if env.get(key)]
    if not settings:
        return

    payload = '\n'.join(settings) + '\n'
    for subdir in ('drivers/net', 'drivers/net/hw'):
        target_dir = os.path.join(test_dir, subdir)
        if not os.path.isdir(target_dir):
            continue
        path = os.path.join(target_dir, 'net.config')
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(payload)
        print(f"Wrote {path}")
131+
132+
133+
def main():
    """Find pending tests, run them, and write results.

    Steps:

    1. Pick the newest unseen test directory under TESTS_DIR; exit
       quietly if there is none.
    2. Compare the ``.kernel-version`` file in that directory with the
       running kernel and bail out on a missing file or a mismatch.
    3. Mark all test directories as seen, configure the test interfaces
       and run the tests.
    4. Write the results as JSON to
       ``RESULTS_DIR/<test dir basename>/results.json`` and fsync the file.
    """
    tests_dir = TESTS_DIR
    results_base = RESULTS_DIR

    test_dir = find_newest_unseen(tests_dir)
    if test_dir is None:
        print("No outstanding tests found")
        return

    # Verify we booted into the expected test kernel by comparing
    # the deployed kernel version against the running kernel.
    kver_path = os.path.join(test_dir, '.kernel-version')
    if not os.path.exists(kver_path):
        print("No kernel version file, skipping")
        return
    with open(kver_path, encoding='utf-8') as fp:
        expected = fp.read().strip()

    actual = os.uname().release
    # The kernel version includes the git hash and instance name
    # (via CONFIG_LOCALVERSION), so accidental prefix collisions
    # (e.g. "6.1" matching "6.12.0") cannot happen in practice.
    # The '-' separator check is an extra safety measure.
    if actual != expected and not actual.startswith(expected + '-'):
        print(f"Kernel mismatch: running {actual}, expected {expected}")
        return

    mark_all_seen(tests_dir)

    # Configure test interfaces and write net.config
    setup_test_interfaces(test_dir)

    reservation_id = os.path.basename(test_dir)
    results_dir = os.path.join(results_base, reservation_id)
    os.makedirs(results_dir, exist_ok=True)

    results = run_tests(test_dir, results_dir)

    results_file = os.path.join(results_dir, 'results.json')
    # Use open() rather than a bare os.open(): os.open() without a mode
    # argument creates the file with 0o777 (minus umask), i.e. executable.
    with open(results_file, 'w', encoding='utf-8') as fp:
        json.dump(results, fp)
        # Push the data to disk before reporting completion
        fp.flush()
        os.fsync(fp.fileno())

    print(f"Completed {len(results)} tests, results in {results_dir}")
180+
181+
182+
# Run the worker only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)