Skip to content

Commit 6d08175

Browse files
Merge pull request #1119 from NHSDigital/NRL-1884-seed-script-generates-input-files
NRL-1884 seed script generates perf test input files
2 parents dc4f477 + 5cd1939 commit 6d08175

File tree

10 files changed

+155
-447
lines changed

10 files changed

+155
-447
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,4 @@ allure-report/*
7979

8080
# Performance test ref data
8181
tests/performance/reference-data.json
82+
tests/performance/producer/expanded_pointer_distributions.json

Makefile

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ HOST ?= $(TF_WORKSPACE_NAME).api.record-locator.$(ENV).national.nhs.uk
1717
ENV_TYPE ?= $(ENV)
1818
PERFTEST_TABLE_NAME ?= perftest
1919
PERFTEST_HOST ?= perftest-1.perftest.record-locator.national.nhs.uk
20+
PERFTEST_PATIENTS_WITH_POINTERS ?= 0
21+
PERFTEST_POINTERS_PER_PATIENT ?= 0
22+
PERFTEST_TYPE_DIST_PROFILE ?= default
23+
PERFTEST_CUSTODIAN_DIST_PROFILE ?= default
2024

2125
export PATH := $(PATH):$(PWD)/.venv/bin
2226
export USE_SHARED_RESOURCES := $(shell poetry run python scripts/are_resources_shared_for_stack.py $(TF_WORKSPACE_NAME))
@@ -249,31 +253,39 @@ generate-models: check-warn ## Generate Pydantic Models
249253
--output-model-type "pydantic_v2.BaseModel"
250254

251255

252-
generate-perftest-permissions: ## Generate perftest permissions and add to nrlf_permissions
253-
poetry run python tests/performance/producer/generate_permissions.py --output_dir="$(DIST_PATH)/nrlf_permissions/K6PerformanceTest"
256+
perftest-generate-permissions: ## Generate perftest permissions and add to nrlf_permissions
257+
PYTHONPATH=. poetry run python tests/performance/producer/generate_permissions.py --output_dir="$(DIST_PATH)/nrlf_permissions/K6PerformanceTest"
254258

255-
perftest-producer:
259+
perftest-seed-tables: ## Seed tables and upload generated perftest input files to s3
260+
@echo "Seeding performance test pointer tables with ENV=$(ENV) and PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and PERFTEST_PATIENTS_WITH_POINTERS=$(PERFTEST_PATIENTS_WITH_POINTERS) and PERFTEST_POINTERS_PER_PATIENT=$(PERFTEST_POINTERS_PER_PATIENT) and PERFTEST_TYPE_DIST_PROFILE=$(PERFTEST_TYPE_DIST_PROFILE) and PERFTEST_CUSTODIAN_DIST_PROFILE=$(PERFTEST_CUSTODIAN_DIST_PROFILE)"
261+
rm -rf "${DIST_PATH}/nft"
262+
mkdir -p "${DIST_PATH}/nft"
263+
PYTHONPATH=. poetry run python ./scripts/seed_nft_tables.py --table_name=$(PERFTEST_TABLE_NAME) --patients_with_pointers=$(PERFTEST_PATIENTS_WITH_POINTERS) --pointers_per_patient=$(PERFTEST_POINTERS_PER_PATIENT) --type_dist_profile=$(PERFTEST_TYPE_DIST_PROFILE) --custodian_dist_profile=$(PERFTEST_CUSTODIAN_DIST_PROFILE)
264+
zip -r "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip" "${DIST_PATH}/nft"
265+
aws s3 cp "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip" "s3://nhsd-nrlf--${ENV}-metadata/performance/seed-pointers-extract-${PERFTEST_TABLE_NAME}.zip"
266+
267+
perftest-prepare: ## Prepare input files for producer & consumer perf tests
268+
@echo "Preparing performance tests with ENV=$(ENV) and PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and DIST_PATH=$(DIST_PATH)"
269+
rm -rf "${DIST_PATH}/nft"
270+
mkdir -p "${DIST_PATH}/nft"
271+
aws s3 cp "s3://nhsd-nrlf--${ENV}-metadata/performance/seed-pointers-extract-${PERFTEST_TABLE_NAME}.zip" "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip"
272+
unzip "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip"
273+
# cp "${DIST_PATH}/nft/seed-pointers-extract-${PERFTEST_TABLE_NAME}.csv" "${DIST_PATH}/seed-pointers-extract.csv"
274+
PYTHONPATH=. poetry run python ./tests/performance/generate_producer_distributions.py
275+
276+
perftest-producer: ## Run producer perf tests
256277
@echo "Running producer performance tests with HOST=$(PERFTEST_HOST) and ENV_TYPE=$(ENV_TYPE) and DIST_PATH=$(DIST_PATH)"
257278
k6 run tests/performance/producer/perftest.js -e HOST=$(PERFTEST_HOST) -e ENV_TYPE=$(ENV_TYPE) -e DIST_PATH=$(DIST_PATH)
258279

259-
perftest-consumer:
280+
perftest-consumer: ## Run consumer perf tests
260281
@echo "Running consumer performance tests with HOST=$(PERFTEST_HOST) and ENV_TYPE=$(ENV_TYPE) and DIST_PATH=$(DIST_PATH)"
261282
k6 run tests/performance/consumer/perftest.js -e HOST=$(PERFTEST_HOST) -e ENV_TYPE=$(ENV_TYPE) -e DIST_PATH=$(DIST_PATH)
262283

263-
perftest-prep-generate-producer-data:
264-
@echo "Generating producer reference with PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and DIST_PATH=$(DIST_PATH)"
265-
mkdir -p $(DIST_PATH)
266-
PYTHONPATH=. poetry run python tests/performance/perftest_environment.py generate_producer_data --output_dir="$(DIST_PATH)"
267-
268-
perftest-prep-extract-consumer-data:
269-
@echo "Generating consumer reference with PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and DIST_PATH=$(DIST_PATH)"
270-
mkdir -p $(DIST_PATH)
271-
PYTHONPATH=. poetry run python tests/performance/perftest_environment.py extract_consumer_data --output_dir="$(DIST_PATH)"
272-
273-
perftest-prep-generate-pointer-table-extract:
284+
perftest-generate-pointer-table-extract: ## Refresh the perf test input files in s3. Can be expensive to run on large tables
274285
@echo "Generating pointer table extract with PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and DIST_PATH=$(DIST_PATH)"
275-
mkdir -p $(DIST_PATH)
276-
PYTHONPATH=. poetry run python tests/performance/perftest_environment.py generate_pointer_table_extract --output_dir="$(DIST_PATH)"
277-
278-
perftest-prepare: perftest-prep-generate-producer-data perftest-prep-extract-consumer-data perftest-prep-generate-pointer-table-extract
279-
@echo "Prepared performance tests with PERFTEST_TABLE_NAME=$(PERFTEST_TABLE_NAME) and DIST_PATH=$(DIST_PATH)"
286+
rm -rf "${DIST_PATH}/nft"
287+
mkdir -p "${DIST_PATH}/nft"
288+
PYTHONPATH=. poetry run python tests/performance/perftest_environment.py generate_pointer_table_extract --output_dir="${DIST_PATH}/nft"
289+
./scripts/get-current-info.sh > "${DIST_PATH}/nft/info.json"
290+
zip -r "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip" "${DIST_PATH}/nft"
291+
aws s3 cp "${DIST_PATH}/pointer_extract-${PERFTEST_TABLE_NAME}.zip" "s3://nhsd-nrlf--${ENV}-metadata/performance/seed-pointers-extract-${PERFTEST_TABLE_NAME}.zip"

scripts/seed_nft_tables.py

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import csv
2+
import os
23
from datetime import datetime, timedelta, timezone
34
from itertools import cycle
45
from math import gcd
@@ -7,10 +8,9 @@
78

89
import boto3
910
import fire
10-
11-
# import json
1211
import numpy as np
1312

13+
from nrlf.core.boto import get_s3_client
1414
from nrlf.core.constants import (
1515
CATEGORY_ATTRIBUTES,
1616
SNOMED_SYSTEM_URL,
@@ -20,12 +20,16 @@
2020
from nrlf.core.dynamodb.model import DocumentPointer
2121
from nrlf.core.logger import logger
2222
from nrlf.tests.data import load_document_reference
23+
from tests.performance.perftest_environment import create_extract_metadata_file
2324
from tests.performance.seed_data_constants import ( # DEFAULT_COUNT_DISTRIBUTIONS,
2425
CHECKSUM_WEIGHTS,
2526
CUSTODIAN_DISTRIBUTION_PROFILES,
2627
TYPE_DISTRIBUTION_PROFILES,
2728
)
2829

30+
dist_path = os.getenv("DIST_PATH", "./dist")
31+
nft_dist_path = f"{dist_path}/nft"
32+
2933
dynamodb = boto3.client("dynamodb")
3034
resource = boto3.resource("dynamodb")
3135

@@ -83,35 +87,56 @@ def _make_seed_pointer(
8387
return nft_pointer
8488

8589

90+
def _write_pointer_extract_to_file(table_name, pointer_data):
91+
local_csv_out = f"{nft_dist_path}/seed-pointers-extract.csv"
92+
local_meta_out = f"{nft_dist_path}/info.json"
93+
94+
print(f"writing pointer extract to files {local_csv_out} {local_meta_out}")
95+
96+
with open(local_csv_out, "w") as file:
97+
writer = csv.writer(file)
98+
writer.writerow(["pointer_id", "pointer_type", "custodian", "nhs_number"])
99+
writer.writerows(pointer_data)
100+
print(f"Pointer data saved to {local_csv_out}")
101+
102+
create_extract_metadata_file(table_name, nft_dist_path)
103+
104+
86105
def _populate_seed_table(
87106
table_name: str,
88-
px_with_pointers: int,
89-
pointers_per_px: float = 1.0,
107+
patients_with_pointers: int,
108+
pointers_per_patient: float = 1.0,
90109
type_dist_profile: str = "default",
91110
custodian_dist_profile: str = "default",
92111
):
93112
"""
94113
Seeds a table with example data for non-functional testing.
95114
"""
96-
if pointers_per_px < 1.0:
115+
if pointers_per_patient < 1.0:
97116
raise ValueError("Cannot populate table with patients with zero pointers")
98117

118+
print(
119+
f"Populating table {table_name} with patients_with_pointers={patients_with_pointers} pointers_per_patient={pointers_per_patient}",
120+
type_dist_profile,
121+
custodian_dist_profile,
122+
)
123+
99124
type_dists = TYPE_DISTRIBUTION_PROFILES[type_dist_profile]
100125
custodian_dists = CUSTODIAN_DISTRIBUTION_PROFILES[custodian_dist_profile]
101126

102127
# set up iterations
103128
type_iter = _set_up_cyclical_iterator(type_dists)
104129
custodian_iters = _set_up_custodian_iterators(custodian_dists)
105130
count_iter = _get_pointer_count_poisson_distributions(
106-
px_with_pointers, pointers_per_px
131+
patients_with_pointers, pointers_per_patient
107132
)
108133
testnum_cls = TestNhsNumbersIterator()
109134
testnum_iter = iter(testnum_cls)
110135

111-
px_counter = 0
112-
doc_ref_target = int(pointers_per_px * px_with_pointers)
136+
patient_counter = 0
137+
doc_ref_target = int(pointers_per_patient * patients_with_pointers)
113138
print(
114-
f"Will upsert ~{doc_ref_target} test pointers for {px_with_pointers} patients."
139+
f"Will upsert ~{doc_ref_target} test pointers for {patients_with_pointers} patients."
115140
)
116141
doc_ref_counter = 0
117142
batch_counter = 0
@@ -120,12 +145,15 @@ def _populate_seed_table(
120145
pointer_data: list[list[str]] = []
121146

122147
start_time = datetime.now(tz=timezone.utc)
123-
124148
batch_upsert_items: list[dict[str, Any]] = []
125-
while px_counter < px_with_pointers:
126-
pointers_for_px = int(next(count_iter))
127149

128-
if batch_counter + pointers_for_px > 25 or px_counter == px_with_pointers:
150+
while patient_counter <= patients_with_pointers:
151+
pointers_for_patient = int(next(count_iter))
152+
153+
if (
154+
batch_counter + pointers_for_patient > 25
155+
or patient_counter == patients_with_pointers
156+
):
129157
response = resource.batch_write_item(
130158
RequestItems={table_name: batch_upsert_items}
131159
)
@@ -138,45 +166,43 @@ def _populate_seed_table(
138166
batch_upsert_items = []
139167
batch_counter = 0
140168

141-
new_px = next(testnum_iter)
142-
for _ in range(pointers_for_px):
169+
new_patient = next(testnum_iter)
170+
for _ in range(pointers_for_patient):
143171
new_type = next(type_iter)
144172
new_custodian = next(custodian_iters[new_type])
145173
doc_ref_counter += 1
146174
batch_counter += 1
147175

148176
pointer = _make_seed_pointer(
149-
new_type, new_custodian, new_px, doc_ref_counter
177+
new_type, new_custodian, new_patient, doc_ref_counter
150178
)
151179
put_req = {"PutRequest": {"Item": pointer.model_dump()}}
152180
batch_upsert_items.append(put_req)
153181
pointer_data.append(
154182
[
155183
pointer.id,
156-
pointer.type,
184+
new_type, # not full type url
157185
pointer.custodian,
158186
pointer.nhs_number,
159187
]
160188
)
161-
px_counter += 1
189+
patient_counter += 1
162190

163-
if px_counter % 1000 == 0:
191+
if patient_counter % 1000 == 0:
164192
print(".", end="", flush=True)
165-
if px_counter % 100000 == 0:
166-
print(f" {px_counter} patients processed ({doc_ref_counter} pointers).")
193+
if patient_counter % 100000 == 0:
194+
print(
195+
f" {patient_counter} patients processed ({doc_ref_counter} pointers)."
196+
)
167197

168-
print(" Done.")
198+
print("Done")
169199

170200
end_time = datetime.now(tz=timezone.utc)
171201
print(
172202
f"Created {doc_ref_counter} pointers in {timedelta.total_seconds(end_time - start_time)} seconds (unprocessed: {unprocessed_count})."
173203
)
174204

175-
with open("./dist/seed-nft-pointers.csv", "w") as f:
176-
writer = csv.writer(f)
177-
writer.writerow(["pointer_id", "pointer_type", "custodian", "nhs_number"])
178-
writer.writerows(pointer_data)
179-
print(f"Pointer data saved to ./dist/seed-nft-pointers.csv") # noqa
205+
_write_pointer_extract_to_file(table_name, pointer_data)
180206

181207

182208
def _set_up_cyclical_iterator(dists: dict[str, int]) -> Iterator[str]:

tests/performance/README.md

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ We have performance tests which give us a benchmark of how NRLF performs under l
88

99
Perf tests are generally conducted in the perftest env. There's a selection of tables in the perftest env representing different pointer volume scenarios e.g. perftest-baseline vs perftest-1million (todo: update with real names!).
1010

11+
#### Pull certs for perftest
12+
13+
```sh
14+
assume nhsd-nrlf-mgmt
15+
make truststore-pull-all ENV=perftest
16+
```
17+
1118
#### Point perftest at a different pointers table
1219

1320
We (will) have multiple tables representing different states of NRLF in the future e.g. all patients receiving an IPS (International Patient Summary), onboarding particular high-volume suppliers.
@@ -29,44 +36,37 @@ Currently, this requires tearing down the existing environment and restoring fro
2936
2. once backed up, delete your table. In the AWS console: dynamodb > tables > your perftest table > actions > delete table
3037
3. Rerun the Deploy Account-wide infrastructure action.
3138
4. Terraform will create an empty table with the correct name & (most importantly!) read/write IAM policies.
32-
5. Delete the empty table created by terraform and restore from the backup, specifying the same table name you've defined in code.
39+
5. Delete the empty table created by terraform and restore from the backup, specifying the same table name you've defined in code & selecting the matching customer-managed encryption key.
3340
6. Run the [Persistent Environment Deploy](https://github.com/NHSDigital/NRLF/actions/workflows/persistent-environment.yml) workflow against your branch & `perftest` to restore the environment with lambdas pointed at your chosen table.
3441
7. You can check this has been successful by checking the table name in the lambdas.
3542
- In the AWS console: Lambda > functions > pick any perftest-1 lambda > Configuration > Environment variables > `TABLE_NAME` should be your desired pointer table e.g. `nhsd-nrlf--perftest-baseline-pointers-table`
3643

3744
If you've followed these steps, you will also need to [generate permissions](#generate-permissions) as the organisation permissions will have been lost when the environment was torn down.
3845

39-
### Prepare to run tests
40-
41-
#### Pull certs for perftest
42-
43-
```sh
44-
assume management
45-
make truststore-pull-all ENV=perftest
46-
```
47-
4846
#### Generate permissions
4947

5048
You will need to generate pointer permissions the first time performance tests are run in an environment e.g. if the perftest environment is destroyed & recreated.
5149

5250
```sh
5351
# In project root
54-
make generate permissions # makes a bunch of json permission files for test organisations
52+
make perftest-generate-permissions # makes a bunch of json permission files for test organisations
5553
make build # will take all permissions & create nrlf_permissions.zip file
5654

5755
# apply this new permissions zip file to your environment
5856
cd ./terraform/infrastructure
59-
assume nhsd-nrlf-test
57+
assume nhsd-nrlf-mgmt
6058
make init TF_WORKSPACE_NAME=perftest-1 ENV=perftest
6159
make ENV=perftest USE_SHARED_RESOURCES=true apply
6260
```
6361

64-
#### Generate input files
62+
### Prepare to run tests
63+
64+
Prepare the input files:
6565

6666
```sh
6767
assume nhsd-nrlf-test
68-
# creates 2 csv files and a json file
69-
make perftest-prepare PERFTEST_TABLE_NAME=perftest-baseline
68+
# PERFTEST_TABLE_NAME = pointer table currently pointed to by perftest env
69+
make perftest-prepare PERFTEST_TABLE_NAME=nhsd-nrlf--perftest-baseline-pointers-table ENV=perftest
7070
```
7171

7272
### Run tests
@@ -76,6 +76,28 @@ make perftest-consumer ENV_TYPE=perftest PERFTEST_HOST=perftest-1.perftest.recor
7676
make perftest-producer ENV_TYPE=perftest PERFTEST_HOST=perftest-1.perftest.record-locator.national.nhs.uk
7777
```
7878

79+
## Seed data
80+
81+
Must be run on an empty table. Cannot top up an existing set of pointers.
82+
83+
```sh
84+
make perftest-seed-tables ENV=perftest \
85+
PERFTEST_TABLE_NAME=nhsd-nrlf--perftest-anjali-test-2-pointers-table \
86+
PERFTEST_PATIENTS_WITH_POINTERS=10 \
87+
PERFTEST_POINTERS_PER_PATIENT=2
88+
```
89+
90+
### Refresh input files in S3
91+
92+
Regenerates the input files from the current state of a given perftest table and uploads the files to S3. These files are usually generated at the end of the seed tables make command (above).
93+
94+
> Note: this can be an expensive operation for large table sizes.
95+
96+
```sh
97+
make perftest-generate-pointer-table-extract \
98+
PERFTEST_TABLE_NAME=nhsd-nrlf--perftest-anjali-test-2-pointers-table
99+
```
100+
79101
## Assumptions / Caveats
80102

81103
- Run performance tests in the perftest environment only\*

tests/performance/consumer/client_perftest.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@ import { check } from "k6";
33
import exec from "k6/execution";
44
import { CATEGORY_TYPE_GROUPS } from "../type-category-mappings.js";
55

6-
const csvPath = __ENV.DIST_PATH
7-
? `../../../${__ENV.DIST_PATH}/producer_reference_data.csv`
8-
: "../producer_reference_data.csv";
6+
const distPath = __ENV.DIST_PATH || "./dist";
7+
const csvPath = `../../../${distPath}/nft/seed-pointers-extract.csv`;
98
const csv = open(csvPath);
109
const lines = csv.trim().split("\n");
1110
// Skip header
@@ -16,7 +15,7 @@ function getNextPointer() {
1615
const iter = exec.vu.iterationInScenario;
1716
const index = iter % dataLines.length;
1817
const line = dataLines[index];
19-
const [count, pointer_id, pointer_type, custodian, nhs_number] = line
18+
const [pointer_id, pointer_type, custodian, nhs_number] = line
2019
.split(",")
2120
.map((field) => field.trim());
2221
return { pointer_id, pointer_type, nhs_number };

tests/performance/consumer/consumer_reference_data.json

Lines changed: 0 additions & 9 deletions
This file was deleted.

tests/performance/producer/generate_distributions.py renamed to tests/performance/generate_producer_distributions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def expand_distribution(dist):
2424

2525
output = {"types": expanded_types, "custodians": expanded_custodians}
2626

27-
out_path = Path("./tests/performance/expanded_pointer_distributions.json")
27+
out_path = Path("./tests/performance/producer/expanded_pointer_distributions.json")
2828
out_path.parent.mkdir(parents=True, exist_ok=True)
2929
with out_path.open("w") as f:
3030
json.dump(output, f, indent=2)

0 commit comments

Comments
 (0)