From c0509207b4442b631f58674e802590b1c1dd62e5 Mon Sep 17 00:00:00 2001
From: Dmitry Ilyin <6576495+widgetii@users.noreply.github.com>
Date: Thu, 14 May 2026 16:41:54 +0300
Subject: [PATCH] Add `membw` subcommand: DDR bandwidth probe (memset / read /
 memcpy)

Closes #160.

`ipctool membw` runs three synthetic memory-bandwidth ops against large
anonymous DDR buffers (mmap of /dev/zero, NOT malloc) and reports MB/s:

  write : memset over the buffer (W-only, libc-dependent)
  read  : volatile uint32_t sum loop (R-only, libc-INdependent --
          the most trustworthy number for cross-firmware comparison)
  copy  : memcpy between two buffers (R+W, counted as 2x bytes)

CLI matches the existing clocks/cpubench shape:

  --size MB    buffer size per pass (default: 16; must exceed L2)
  --iters N    passes per op (default: 16)
  --ops a,b,c  comma list of write,read,copy (default: all)
  --json       JSON output instead of YAML

Output is YAML by default, with a `chip:` tag for context:

  membw:
    buffer_mb: 16
    iters: 16
    results:
      write:
        mb_per_sec: 2243
        duration_s: 0.120
      read:
        mb_per_sec: 421
        duration_s: 0.637
      copy:
        mb_per_sec: 1863
        duration_s: 0.288
    chip: hi3516ev300
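With --json the same cJSON tree is serialized via cJSON_Print instead
of cYAML; the shape should look roughly like this (indentation and
number rounding illustrative -- cJSON applies its own formatting):

  {
    "membw": {
      "buffer_mb": 16,
      "iters": 16,
      "results": {
        "write": { "mb_per_sec": 2243, "duration_s": 0.12 },
        "read":  { "mb_per_sec": 421,  "duration_s": 0.637 },
        "copy":  { "mb_per_sec": 1863, "duration_s": 0.288 }
      },
      "chip": "hi3516ev300"
    }
  }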
Use case (from #161 / #162 debugging): when two boards with the same
SoC behave differently, this separates "CPU pipeline is the bottleneck"
from "DDR pipeline is the bottleneck" in a few seconds. With APLL
decode and HPM bin now in `ipctool clocks` from #162-#164, this PR
closes the third leg of the same investigation flow.
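A typical triage pass, using the flags above (commands illustrative):

  # DDR config baseline -- stop the streamer first
  ipctool membw

  # libc-INdependent read-only figure for cross-firmware comparison
  ipctool membw --ops read

  # memory-constrained boards: shrink the buffers; JSON for fleet scripts
  ipctool membw --size 4 --json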
Verified on four lab boards (all with majestic / vendor App stopped, to
measure the DDR config baseline rather than workload):

  hi3516ev300 (V4,  OpenIPC):   write 2243  read 421  copy 1863 MB/s
  gk7205v300  (V4,  OpenIPC):   write 2096  read 417  copy 1633 MB/s
  gk7205v300  (V4,  XM Sofia):  write 1576  read 370  copy 1302 MB/s  [--size 4]
  hi3516av300 (V4A, OpenIPC):   write 2320  read 427  copy 2440 MB/s
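These figures are plain bytes-over-time, per the formula in run_bench
((double)sz * iters / dt / 1e6); cross-checking the hi3516ev300 row by
hand:

  write: 16 MiB x 16 iters = 268435456 bytes
         268435456 B / 0.120 s / 1e6 ~= 2237 MB/s (reported 2243; the
         YAML duration is rounded to 3 decimals, the MB/s value is not)
  copy:  counted as 2x bytes (one read + one write):
         2 * 268435456 B / 0.288 s / 1e6 ~= 1864 MB/s (reported 1863)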
XM Sofia ran with --size 4 because the board has only 48 MB userspace
memory (the rest is mmz_anonymous for the encoder), so the default
32 MB total (2 x 16 MB buffers) doesn't fit -- which makes --size a
genuine necessity on memory-constrained boards, not just a tuning knob.

Buffer-via-mmap caveat baked in per the issue: anonymous DDR pages
rather than tmpfs / page cache.

Co-authored-by: Claude Opus 4.7 (1M context)
---
 CMakeLists.txt |   2 +
 src/main.c     |   7 ++
 src/membw.c    | 282 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/membw.h    |   6 ++
 4 files changed, 297 insertions(+)
 create mode 100644 src/membw.c
 create mode 100644 src/membw.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7296551..2ad7010 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,6 +105,8 @@ set(IPCTOOL_SRC
     src/i2cspi.c
     src/i2cspi.h
     src/main.c
+    src/membw.c
+    src/membw.h
     src/mtd.c
     src/mtd.h
     src/network.c
diff --git a/src/main.c b/src/main.c
index aa348ca..28a9378 100644
--- a/src/main.c
+++ b/src/main.c
@@ -23,6 +23,7 @@
 #include "hal/hisi/hal_hisi.h"
 #include "hwinfo.h"
 #include "i2cspi.h"
+#include "membw.h"
 #include "mtd.h"
 #include "network.h"
 #include "ptrace.h"
@@ -107,6 +108,10 @@ void print_usage() {
            "  cpubench [--json] [--loops N]\n"
            "                      triangulate CPU clock by running three\n"
            "                      tight inline-asm patterns (ARM only)\n"
+           "  membw [--size MB] [--iters N] [--ops set,...] [--json]\n"
+           "                      DDR bandwidth probe (memset / read scan "
+           "/\n"
+           "                      memcpy)\n"
            "  sensor monitor      poll AE/exposure registers from the\n"
            "                      running sensor every 2s. Supported:\n"
            "                      SC2315E, IMX291, IMX385.\n"
@@ -186,6 +191,8 @@ int main(int argc, char *argv[]) {
         return clocks_cmd(argc - 1, argv + 1);
     else if (!strcmp(argv[1], "cpubench"))
         return cpubench_cmd(argc - 1, argv + 1);
+    else if (!strcmp(argv[1], "membw"))
+        return membw_cmd(argc - 1, argv + 1);
 #ifdef __arm__
     else if (!strcmp(argv[1], "trace"))
         return ptrace_cmd(argc - 1, argv + 1);
diff --git a/src/membw.c b/src/membw.c
new file mode 100644
index 0000000..704abd6
--- /dev/null
+++ b/src/membw.c
@@ -0,0 +1,282 @@
+/* `ipctool membw` -- synthetic DDR bandwidth probe.
+ *
+ * Implements OpenIPC/ipctool#160. Runs three memory-bandwidth ops
+ * against large anonymous DDR buffers and reports MB/s:
+ *
+ *   write : memset over the buffer (W-only, libc-dependent)
+ *   read  : volatile uint32_t accumulator (R-only, libc-INdependent --
+ *           the most trustworthy number when comparing firmwares with
+ *           different libcs)
+ *   copy  : memcpy between two buffers (R+W, counted as 2x bytes)
+ *
+ * Caveats baked into the design (per the issue body):
+ *   - Buffers are obtained via `mmap(/dev/zero)`, NOT `malloc`, so they
+ *     come from clean anonymous DDR pages rather than tmpfs / page cache.
+ *   - Default 16 MB per buffer comfortably exceeds the L2 cache on V4
+ *     family (256 KB - 1 MB). Smaller sizes measure L2/L1, not DDR.
+ *   - Streamer / encoder DMA traffic loads DDR. To measure the DDR
+ *     *config* baseline, stop majestic / vendor App first. To measure
+ *     real *workload* bandwidth, leave them running.
+ *   - The default of 16 MB x 16 iters processes ~1 GB across all three
+ *     ops, which takes <2 s on a healthy V4 board and is light enough
+ *     to run with the streamer up.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "chipid.h"
+#include "cjson/cJSON.h"
+#include "cjson/cYAML.h"
+#include "membw.h"
+#include "tools.h"
+
+static double now_sec(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec + ts.tv_nsec / 1e9;
+}
+
+struct membw_opts {
+    size_t mb;
+    int iters;
+    bool do_write;
+    bool do_read;
+    bool do_copy;
+    bool want_json;
+};
+
+struct op_result {
+    const char *name;
+    double duration_s;
+    double mb_per_sec;
+};
+
+static cJSON *result_to_json(const struct op_result *r) {
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM_NUM("mb_per_sec", r->mb_per_sec);
+    ADD_PARAM_NUM("duration_s", r->duration_s);
+    return j_inner;
+}
+
+static cJSON *run_bench(const struct membw_opts *o) {
+    size_t sz = o->mb * 1024UL * 1024UL;
+
+    int fd = open("/dev/zero", O_RDWR);
+    if (fd < 0) {
+        fprintf(stderr, "membw: open /dev/zero: %s\n", strerror(errno));
+        return NULL;
+    }
+    char *a = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    char *b = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (a == MAP_FAILED || b == MAP_FAILED) {
+        fprintf(stderr, "membw: mmap %zu MB: %s\n", o->mb, strerror(errno));
+        if (a != MAP_FAILED)
+            munmap(a, sz);
+        if (b != MAP_FAILED)
+            munmap(b, sz);
+        close(fd);
+        return NULL;
+    }
+
+    /* Fault pages in so the first real iteration isn't measuring
+     * page-fault cost. */
+    memset(a, 1, sz);
+    memset(b, 2, sz);
+
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM_NUM("buffer_mb", (double)o->mb);
+    ADD_PARAM_NUM("iters", (double)o->iters);
+
+    cJSON *results = cJSON_CreateObject();
+
+    if (o->do_write) {
+        double t0 = now_sec();
+        for (int i = 0; i < o->iters; i++)
+            memset(a, i & 0xff, sz);
+        double dt = now_sec() - t0;
+        struct op_result r = {.name = "write",
+                              .duration_s = dt,
+                              .mb_per_sec = (double)sz * o->iters / dt / 1e6};
+        cJSON_AddItemToObject(results, "write", result_to_json(&r));
+    }
+
+    if (o->do_read) {
+        volatile uint32_t sum = 0;
+        double t0 = now_sec();
+        for (int i = 0; i < o->iters; i++) {
+            uint32_t *p = (uint32_t *)a;
+            size_t n = sz / 4;
+            for (size_t k = 0; k < n; k++)
+                sum += p[k];
+        }
+        double dt = now_sec() - t0;
+        struct op_result r = {.name = "read",
+                              .duration_s = dt,
+                              .mb_per_sec = (double)sz * o->iters / dt / 1e6};
+        cJSON_AddItemToObject(results, "read", result_to_json(&r));
+        /* sum is volatile, so the loads can't be elided; the cast just
+         * silences the unused-variable warning */
+        (void)sum;
+    }
+
+    if (o->do_copy) {
+        double t0 = now_sec();
+        for (int i = 0; i < o->iters; i++)
+            memcpy(b, a, sz);
+        double dt = now_sec() - t0;
+        /* memcpy moves 2x bytes (one read, one write) per byte of buffer */
+        struct op_result r = {.name = "copy",
+                              .duration_s = dt,
+                              .mb_per_sec =
+                                  (double)sz * o->iters * 2.0 / dt / 1e6};
+        cJSON_AddItemToObject(results, "copy", result_to_json(&r));
+    }
+
+    cJSON_AddItemToObject(j_inner, "results", results);
+
+    munmap(a, sz);
+    munmap(b, sz);
+    close(fd);
+    return j_inner;
+}
+
+static bool parse_ops(const char *spec, struct membw_opts *o) {
+    o->do_write = o->do_read = o->do_copy = false;
+    char buf[64];
+    strncpy(buf, spec, sizeof(buf) - 1);
+    buf[sizeof(buf) - 1] = '\0';
+
+    char *save = NULL;
+    for (char *tok = strtok_r(buf, ",", &save); tok;
+         tok = strtok_r(NULL, ",", &save)) {
+        if (!strcmp(tok, "write"))
+            o->do_write = true;
+        else if (!strcmp(tok, "read"))
+            o->do_read = true;
+        else if (!strcmp(tok, "copy"))
+            o->do_copy = true;
+        else
+            return false;
+    }
+    return o->do_write || o->do_read || o->do_copy;
+}
+
+static void print_membw_usage(void) {
+    printf(
+        "Usage: ipctool membw [--size MB] [--iters N] [--ops set,...] "
+        "[--json]\n"
+        "\n"
+        "Synthetic DDR bandwidth probe. Runs memset (write) / volatile-sum\n"
+        "(read) / memcpy (copy) over anonymous DDR buffers and reports\n"
+        "MB/s for each. Useful for separating CPU-bound from DDR-bound\n"
+        "performance regressions, and for fleet comparison across boards\n"
+        "with the same SoC.\n"
+        "\n"
+        "  --size MB    buffer size per pass (default: 16; must exceed L2)\n"
+        "  --iters N    passes per op (default: 16)\n"
+        "  --ops a,b,c  comma list of write / read / copy (default: all)\n"
+        "  --json       machine-readable JSON instead of YAML\n"
+        "\n"
+        "The `read` op is libc-INdependent and the most trustworthy number\n"
+        "for cross-firmware comparison; `write` and `copy` are bounded by\n"
+        "libc memset/memcpy vectorization.\n"
+        "\n"
+        "Run with majestic / vendor encoder stopped to measure the DDR\n"
+        "config baseline; leave them running to measure real workload\n"
+        "bandwidth.\n");
+}
+
+int membw_cmd(int argc, char **argv) {
+    struct membw_opts o = {
+        .mb = 16,
+        .iters = 16,
+        .do_write = true,
+        .do_read = true,
+        .do_copy = true,
+        .want_json = false,
+    };
+
+    const struct option long_options[] = {
+        {"size", required_argument, NULL, 's'},
+        {"iters", required_argument, NULL, 'i'},
+        {"ops", required_argument, NULL, 'o'},
+        {"json", no_argument, NULL, 'j'},
+        {"help", no_argument, NULL, 'h'},
+        {NULL, 0, NULL, 0},
+    };
+    int opt;
+    optind = 1;
+    while ((opt = getopt_long(argc, argv, "s:i:o:jh", long_options, NULL)) !=
+           -1) {
+        switch (opt) {
+        case 's': {
+            long mb = strtol(optarg, NULL, 10);
+            if (mb < 1 || mb > 4096) {
+                fprintf(stderr, "membw: --size must be 1..4096 MB\n");
+                return EXIT_FAILURE;
+            }
+            o.mb = (size_t)mb;
+            break;
+        }
+        case 'i': {
+            long it = strtol(optarg, NULL, 10);
+            if (it < 1 || it > 1024) {
+                fprintf(stderr, "membw: --iters must be 1..1024\n");
+                return EXIT_FAILURE;
+            }
+            o.iters = (int)it;
+            break;
+        }
+        case 'o':
+            if (!parse_ops(optarg, &o)) {
+                fprintf(stderr, "membw: --ops must be a comma list of "
+                                "write,read,copy\n");
+                return EXIT_FAILURE;
+            }
+            break;
+        case 'j':
+            o.want_json = true;
+            break;
+        case 'h':
+            print_membw_usage();
+            return EXIT_SUCCESS;
+        default:
+            print_membw_usage();
+            return EXIT_FAILURE;
+        }
+    }
+
+    cJSON *bench = run_bench(&o);
+    if (!bench)
+        return EXIT_FAILURE;
+
+    /* Tag with chip identity for context. Falls through cleanly if
+     * chip detection didn't run (e.g. on a host build). */
+    const char *chip = getchipname();
+    if (chip) {
+        cJSON_AddItemToObject(bench, "chip", cJSON_CreateString(chip));
+    }
+
+    cJSON *root = cJSON_CreateObject();
+    cJSON_AddItemToObject(root, "membw", bench);
+
+    char *out = o.want_json ? cJSON_Print(root) : cYAML_Print(root);
+    if (out) {
+        printf("%s", out);
+        if (o.want_json)
+            printf("\n");
+        free(out);
+    }
+    cJSON_Delete(root);
+    return EXIT_SUCCESS;
+}
diff --git a/src/membw.h b/src/membw.h
new file mode 100644
index 0000000..53d35da
--- /dev/null
+++ b/src/membw.h
@@ -0,0 +1,6 @@
+#ifndef MEMBW_H
+#define MEMBW_H
+
+int membw_cmd(int argc, char **argv);
+
+#endif /* MEMBW_H */