diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12b77a8..f12f2a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,10 @@ set(COMMON_LIB_SRC
 set(IPCTOOL_SRC
   src/backup.c
   src/backup.h
+  src/clocks.c
+  src/clocks.h
+  src/cpubench.c
+  src/cpubench.h
   src/dns.c
   src/dns.h
   src/ethernet.c
@@ -86,6 +90,7 @@ set(IPCTOOL_SRC
   src/fake_symbols.c
   src/firmware.c
   src/firmware.h
+  src/hal/hisi/clocks_v4.c
   src/hal/hisi/ethernet.c
   src/hal/hisi/ethernet.h
   src/hal/hisi/ispreg.c
diff --git a/src/clocks.c b/src/clocks.c
new file mode 100644
index 0000000..34217da
--- /dev/null
+++ b/src/clocks.c
@@ -0,0 +1,407 @@
+/* `ipctool clocks` (alias `freq`) — show CPU/peripheral PLL frequencies, DDR
+ * clock, and per-die HPM characterization. Implements OpenIPC/ipctool#161. */
+
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "chipid.h"
+#include "cjson/cJSON.h"
+#include "cjson/cYAML.h"
+#include "clocks.h"
+#include "tools.h"
+
+extern const struct clock_family clocks_family_v4;
+
+/* TODO: add V1/V2/V3/V3A/V4A/OT/3536C/3536D tables — they share the
+ * CRG-table approach but use different bases and bit layouts. */
+static const struct clock_family *const families[] = {
+    &clocks_family_v4,
+};
+
+/* Look up the register table whose chip_id matches; NULL if unsupported. */
+static const struct clock_family *family_for_chip(int chip_id) {
+    for (size_t i = 0; i < ARRCNT(families); i++) {
+        if (families[i]->chip_id == chip_id)
+            return families[i];
+    }
+    return NULL;
+}
+
+/* Record a failed auxiliary read as an empty string under `key`. */
+static void add_read_failure(cJSON *parent, const char *key) {
+    cJSON_AddItemToObject(parent, key, cJSON_CreateString(""));
+}
+
+/* Extract a `width`-bit field starting at bit `shift` of `raw`. Returns 0 for
+ * an absent field (width 0) or an out-of-range shift (>= 32 would be UB). */
+static uint32_t extract_field(uint32_t raw, uint8_t shift, uint8_t width) {
+    if (width == 0 || shift >= 32)
+        return 0;
+    uint32_t mask = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1u);
+    return (raw >> shift) & mask;
+}
+
+/* Read the PLL control register(s), decode the divider fields and report
+ * them plus the resulting frequency. Always returns an object; on a failed
+ * read it carries an "error" entry instead of the decoded fields. */
+static cJSON *decode_pll(const struct pll_info *pll) {
+    cJSON *j_inner = cJSON_CreateObject();
+
+    /* Read ctrl_reg1 (FRACDIV / POSTDIV1 / POSTDIV2) */
+    uint32_t raw1 = 0;
+    bool ok1 = mem_reg(pll->ctrl_reg1, &raw1, OP_READ);
+    /* Read ctrl_reg2 if present (FBDIV / REFDIV); else FBDIV/REFDIV live on
+     * ctrl_reg1 (single-register PLLs used by older HiSi variants). */
+    uint32_t raw2 = raw1;
+    bool ok2 = true;
+    if (pll->ctrl_reg2 && pll->ctrl_reg2 != pll->ctrl_reg1)
+        ok2 = mem_reg(pll->ctrl_reg2, &raw2, OP_READ);
+
+    if (!ok1 || !ok2) {
+        ADD_PARAM("error", "register read failed");
+        ADD_PARAM_FMT("ctrl_reg1", "0x%08x", pll->ctrl_reg1);
+        if (pll->ctrl_reg2)
+            ADD_PARAM_FMT("ctrl_reg2", "0x%08x", pll->ctrl_reg2);
+        return j_inner;
+    }
+
+    uint32_t fracdiv = extract_field(raw1, pll->frac_shift, pll->frac_width);
+    uint32_t pdiv1 =
+        extract_field(raw1, pll->postdiv1_shift, pll->postdiv1_width);
+    uint32_t pdiv2 =
+        extract_field(raw1, pll->postdiv2_shift, pll->postdiv2_width);
+    uint32_t fbdiv = extract_field(raw2, pll->fbdiv_shift, pll->fbdiv_width);
+    uint32_t refdiv = extract_field(raw2, pll->refdiv_shift, pll->refdiv_width);
+    if (pll->postdiv1_width == 0)
+        pdiv1 = 1;
+    if (pll->postdiv2_width == 0)
+        pdiv2 = 1;
+    if (pll->refdiv_width == 0)
+        refdiv = 1;
+
+    ADD_PARAM_FMT("ctrl_reg1", "0x%08x", pll->ctrl_reg1);
+    ADD_PARAM_FMT("ctrl_reg1_raw", "0x%08x", raw1);
+    if (pll->ctrl_reg2 && pll->ctrl_reg2 != pll->ctrl_reg1) {
+        ADD_PARAM_FMT("ctrl_reg2", "0x%08x", pll->ctrl_reg2);
+        ADD_PARAM_FMT("ctrl_reg2_raw", "0x%08x", raw2);
+    }
+    ADD_PARAM_NUM("fbdiv", fbdiv);
+    ADD_PARAM_NUM("refdiv", refdiv);
+    ADD_PARAM_NUM("postdiv1", pdiv1);
+    ADD_PARAM_NUM("postdiv2", pdiv2);
+    if (pll->frac_width)
+        ADD_PARAM_FMT("fracdiv", "0x%06x", fracdiv);
+
+    if (fbdiv == 0) {
+        ADD_PARAM("note", "PLL gated (FBDIV=0)");
+        ADD_PARAM_NUM("freq_mhz", 0);
+        return j_inner;
+    }
+    uint32_t denom = refdiv * pdiv1 * pdiv2;
+    if (denom == 0) {
+        ADD_PARAM("note", "invalid divisor (REFDIV*POSTDIV1*POSTDIV2 = 0)");
+        ADD_PARAM_NUM("freq_mhz", 0);
+        return j_inner;
+    }
+    /* f = input * (FBDIV + FRACDIV/2^frac_width) / (REFDIV * POSTDIV1 *
+     * POSTDIV2). The numerator is kept in kHz as a 64-bit integer so the
+     * FBDIV term is exact; the FRACDIV term is truncated to whole kHz. */
+    uint64_t numer_khz = (uint64_t)pll->input_khz * fbdiv;
+    if (pll->frac_width)
+        numer_khz += ((uint64_t)pll->input_khz * fracdiv) >> pll->frac_width;
+    double freq_mhz = (double)numer_khz / (double)denom / 1000.0;
+    ADD_PARAM_NUM("freq_mhz", freq_mhz);
+    return j_inner;
+}
+
+/* Dump one diagnostic register verbatim (address, raw value, note). */
+static cJSON *decode_raw(const struct raw_reg_info *r) {
+    cJSON *j_inner = cJSON_CreateObject();
+    uint32_t raw = 0;
+    if (!mem_reg(r->reg, &raw, OP_READ)) {
+        ADD_PARAM("error", "register read failed");
+        ADD_PARAM_FMT("reg", "0x%08x", r->reg);
+        return j_inner;
+    }
+    ADD_PARAM_FMT("reg", "0x%08x", r->reg);
+    ADD_PARAM_FMT("raw", "0x%08x", raw);
+    if (r->note)
+        ADD_PARAM("note", r->note);
+    return j_inner;
+}
+
+/* Decode a clock-select field and map it to MHz via the mux table. */
+static cJSON *decode_mux(const struct mux_info *mux) {
+    cJSON *j_inner = cJSON_CreateObject();
+    uint32_t raw = 0;
+    if (!mem_reg(mux->reg, &raw, OP_READ)) {
+        ADD_PARAM("error", "register read failed");
+        ADD_PARAM_FMT("reg", "0x%08x", mux->reg);
+        return j_inner;
+    }
+    uint8_t sel = (raw >> mux->sel_shift) & mux->sel_mask;
+
+    ADD_PARAM_FMT("reg", "0x%08x", mux->reg);
+    ADD_PARAM_FMT("raw", "0x%08x", raw);
+    ADD_PARAM_NUM("cksel", sel);
+
+    uint16_t mhz = 0;
+    bool found = false;
+    for (size_t i = 0; i < mux->table_len; i++) {
+        if (mux->table[i].sel == sel) {
+            mhz = mux->table[i].mhz;
+            found = true;
+            break;
+        }
+    }
+    if (found) {
+        ADD_PARAM_NUM("freq_mhz", mhz);
+        if (mux->rate_mult)
+            ADD_PARAM_NUM("data_rate_mbps", (uint32_t)mhz * mux->rate_mult);
+    } else {
+        ADD_PARAM("note", "cksel value not in known table");
+    }
+    return j_inner;
+}
+
+/* Classify an HPM reading against the validity window and binning range. */
+static const char *hpm_bin(uint16_t v, const struct hpm_info *h) {
+    if (v < h->window_min || v > h->window_max)
+        return "out_of_spec";
+    if (v < h->bin_min)
+        return "below_window";
+    if (v > h->bin_max)
+        return "above_window";
+    /* Split [bin_min..bin_max] into thirds. */
+    uint32_t span = h->bin_max - h->bin_min;
+    uint32_t t = (uint32_t)(v - h->bin_min) * 3;
+    if (t < span)
+        return "low";
+    if (t < span * 2)
+        return "mid";
+    return "high";
+}
+
+/* Decode the HPM reading; returns NULL when the register looks absent. */
+static cJSON *decode_hpm(const struct hpm_info *h) {
+    cJSON *j_inner = cJSON_CreateObject();
+    uint32_t raw = 0;
+    if (!mem_reg(h->reg, &raw, OP_READ) || raw == 0xFFFFFFFF) {
+        /* HPM register unreadable or reads back all-ones (treated as absent
+         * on this variant): discard the object and return NULL so the caller
+         * omits the entry. */
+        cJSON_Delete(j_inner);
+        return NULL;
+    }
+    uint16_t value = (raw >> h->value_shift) & h->value_mask;
+    const char *bin = hpm_bin(value, h);
+
+    ADD_PARAM_FMT("reg", "0x%08x", h->reg);
+    ADD_PARAM_FMT("raw", "0x%08x", raw);
+    ADD_PARAM_NUM("value", value);
+    ADD_PARAM("bin", bin);
+
+    cJSON *window = cJSON_CreateArray();
+    cJSON_AddItemToArray(window, cJSON_CreateNumber(h->bin_min));
+    cJSON_AddItemToArray(window, cJSON_CreateNumber(h->bin_max));
+    cJSON_AddItemToObject(j_inner, "binning_window", window);
+
+    if (!strcmp(bin, "low") || !strcmp(bin, "below_window")) {
+        ADD_PARAM("note",
+                  "low-bin silicon; mask ROM may have selected a reduced PLL "
+                  "multiplier at boot");
+    }
+
+    if (h->aux_reg) {
+        uint32_t aux = 0;
+        if (mem_reg(h->aux_reg, &aux, OP_READ)) {
+            ADD_PARAM_FMT("aux_reg", "0x%08x", h->aux_reg);
+            ADD_PARAM_FMT("aux_value", "0x%08x", aux);
+            if (h->aux_name)
+                ADD_PARAM("aux_name", h->aux_name);
+        } else if (h->aux_name) {
+            add_read_failure(j_inner, h->aux_name);
+        }
+    }
+    return j_inner;
+}
+
+/* Parse a leading unsigned decimal from `path` into *out. */
+static bool read_uint_from_file(const char *path, uint32_t *out) {
+    FILE *f = fopen(path, "r");
+    if (!f)
+        return false;
+    unsigned long v;
+    bool ok = fscanf(f, "%lu", &v) == 1;
+    fclose(f);
+    if (ok)
+        *out = (uint32_t)v;
+    return ok;
+}
+
+/* Read the first whitespace-delimited token of `path` into `buf`. Returns
+ * false if the file cannot be opened or yields no token. */
+static bool read_first_token_from_file(const char *path, char *buf,
+                                       size_t buflen) {
+    FILE *f = fopen(path, "r");
+    if (!f)
+        return false;
+    bool ok = false;
+    if (fgets(buf, (int)buflen, f)) {
+        /* Cut at the first whitespace so only the first token remains
+         * (this also drops the trailing newline and any CR/tab). */
+        buf[strcspn(buf, " \t\r\n")] = '\0';
+        ok = buf[0] != '\0';
+    }
+    fclose(f);
+    return ok;
+}
+
+/* Report cpufreq's current frequency/governor; NULL if it is unreadable. */
+static cJSON *build_cpu_running(void) {
+    /* TODO: also report per-core scaling_cur_freq on SMP variants. */
+    uint32_t khz = 0;
+    if (!read_uint_from_file(
+            "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", &khz))
+        return NULL;
+
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM_NUM("freq_mhz", khz / 1000.0);
+
+    char gov[64];
+    if (read_first_token_from_file(
+            "/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", gov,
+            sizeof(gov))) {
+        ADD_PARAM("governor", gov);
+    }
+    return j_inner;
+}
+
+static void add_plls(cJSON *parent, const struct clock_family *fam) {
+    for (size_t i = 0; i < fam->n_plls; i++) {
+        cJSON *p = decode_pll(&fam->plls[i]);
+        cJSON_AddItemToObject(parent, fam->plls[i].name, p);
+    }
+}
+
+static void add_muxes(cJSON *parent, const struct clock_family *fam) {
+    for (size_t i = 0; i < fam->n_muxes; i++) {
+        cJSON *m = decode_mux(&fam->muxes[i]);
+        cJSON_AddItemToObject(parent, fam->muxes[i].name, m);
+    }
+}
+
+static void add_hpms(cJSON *parent, const struct clock_family *fam) {
+    for (size_t i = 0; i < fam->n_hpms; i++) {
+        cJSON *h = decode_hpm(&fam->hpms[i]);
+        if (h)
+            cJSON_AddItemToObject(parent, fam->hpms[i].name, h);
+    }
+}
+
+static void add_raws(cJSON *parent, const struct clock_family *fam) {
+    for (size_t i = 0; i < fam->n_raws; i++) {
+        cJSON *r = decode_raw(&fam->raws[i]);
+        cJSON_AddItemToObject(parent, fam->raws[i].name, r);
+    }
+}
+
+cJSON *clocks_build_json(void) {
+    /* Make sure chip detection has run and chip_generation is populated. */
+    if (!getchipname())
+        return NULL;
+
+    const struct clock_family *fam = family_for_chip(chip_generation);
+    if (!fam)
+        return NULL;
+
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM("family", fam->label);
+
+    add_plls(j_inner, fam);
+    add_muxes(j_inner, fam);
+    add_hpms(j_inner, fam);
+    add_raws(j_inner, fam);
+
+    cJSON *running = build_cpu_running();
+    if (running)
+        cJSON_AddItemToObject(j_inner, "cpu_running", running);
+
+    return j_inner;
+}
+
+static void print_clocks_usage(const char *prog) {
+    printf("Usage: %s clocks [--json]\n"
+           " %s freq [--json]\n"
+           "\n"
+           "Show CPU PLL, peripheral PLL, DDR clock and per-die HPM\n"
+           "characterization (mask-ROM PLL binning) of the running SoC.\n"
+           "\n"
+           "Output is YAML by default; --json emits JSON.\n"
+           "Currently supported: Hisilicon V4 / Goke V300 family.\n",
+           prog, prog);
+}
+
+/* Entry point for `ipctool clocks` / `ipctool freq`. */
+int clocks_cmd(int argc, char **argv) {
+    bool want_json = false;
+
+    const struct option long_options[] = {
+        {"json", no_argument, NULL, 'j'},
+        {"help", no_argument, NULL, 'h'},
+        {NULL, 0, NULL, 0},
+    };
+    int opt;
+    optind = 1;
+    while ((opt = getopt_long(argc, argv, "jh", long_options, NULL)) != -1) {
+        switch (opt) {
+        case 'j':
+            want_json = true;
+            break;
+        case 'h':
+            print_clocks_usage("ipctool");
+            return EXIT_SUCCESS;
+        default:
+            print_clocks_usage("ipctool");
+            return EXIT_FAILURE;
+        }
+    }
+
+    /* getchipname runs once for the lifetime of the process; calling it here
+     * makes sure chip_generation is set before we dispatch. */
+    if (!getchipname()) {
+        fprintf(stderr, "clocks: cannot identify SoC\n");
+        return EXIT_FAILURE;
+    }
+
+    if (!family_for_chip(chip_generation)) {
+        fprintf(stderr,
+                "clocks: chip family 0x%x (%s) not supported yet; only "
+                "Hisilicon V4 / Goke V300 is implemented\n",
+                chip_generation, chip_name);
+        return EXIT_FAILURE;
+    }
+
+    cJSON *clocks = clocks_build_json();
+    if (!clocks) {
+        fprintf(stderr, "clocks: failed to build clock info\n");
+        return EXIT_FAILURE;
+    }
+
+    cJSON *root = cJSON_CreateObject();
+    cJSON_AddItemToObject(root, "clocks", clocks);
+
+    char *out = want_json ? cJSON_Print(root) : cYAML_Print(root);
+    if (out) {
+        printf("%s", out);
+        if (want_json)
+            printf("\n");
+        free(out);
+    }
+    cJSON_Delete(root);
+    return EXIT_SUCCESS;
+}
diff --git a/src/clocks.h b/src/clocks.h
new file mode 100644
index 0000000..ff7f0f9
--- /dev/null
+++ b/src/clocks.h
@@ -0,0 +1,99 @@
+#ifndef CLOCKS_H
+#define CLOCKS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cjson/cJSON.h"
+
+/* PLL descriptor matching the HiSilicon `*_pll_clock` layout (e.g.
+ * `struct hi3516a_pll_clock` in the Hi3516A SDK kernel patch). One PLL spans
+ * two control registers:
+ *   ctrl_reg1: FRACDIV, POSTDIV1, POSTDIV2
+ *   ctrl_reg2: FBDIV, REFDIV
+ * Frequency: f = input_khz * (FBDIV + FRACDIV / 2^frac_width)
+ *                / (REFDIV * POSTDIV1 * POSTDIV2)
+ *
+ * For families that use a single-register PLL, set ctrl_reg2 == 0 and put
+ * the relevant fields on ctrl_reg1 (frac_width may be 0 to skip FRACDIV).
+ * Set a postdiv1/postdiv2/refdiv *_width to 0 to fix that divider at 1. */
+struct pll_info {
+    const char *name;  /* JSON key, e.g. "cpu_pll" */
+    const char *label; /* human label, e.g. "CPU PLL (APLL)" */
+    uint32_t ctrl_reg1;
+    uint8_t frac_shift;
+    uint8_t frac_width; /* 0 = no FRACDIV */
+    uint8_t postdiv1_shift;
+    uint8_t postdiv1_width; /* 0 = postdiv1 fixed to 1 */
+    uint8_t postdiv2_shift;
+    uint8_t postdiv2_width; /* 0 = postdiv2 fixed to 1 */
+    uint32_t ctrl_reg2;     /* 0 = FBDIV/REFDIV live on ctrl_reg1 */
+    uint8_t fbdiv_shift;
+    uint8_t fbdiv_width;
+    uint8_t refdiv_shift;
+    uint8_t refdiv_width; /* 0 = refdiv fixed to 1 */
+    uint32_t input_khz;   /* crystal frequency; 24000 on V4 */
+};
+
+struct mux_entry {
+    uint8_t sel;
+    uint16_t mhz;
+};
+
+struct mux_info {
+    const char *name;
+    const char *label;
+    uint32_t reg;
+    uint8_t sel_shift;
+    uint8_t sel_mask;
+    const struct mux_entry *table;
+    size_t table_len;
+    uint8_t rate_mult; /* DDR3 = 4 (quad-pumped); 0 = no data_rate */
+};
+
+struct hpm_info {
+    const char *name;
+    const char *label;
+    uint32_t reg;
+    uint8_t value_shift;
+    uint16_t value_mask;
+    uint16_t window_min; /* HPM_CORE_MIN — outer validity */
+    uint16_t window_max; /* HPM_CORE_MAX */
+    uint16_t bin_min;    /* HPM_CORE_VALUE_MIN — nominal binning low */
+    uint16_t bin_max;    /* HPM_CORE_VALUE_MAX — nominal binning high */
+    uint32_t aux_reg;    /* 0 = none */
+    const char *aux_name;
+};
+
+/* Raw register dump entry. Used for mask-ROM-written diagnostic slots that
+ * look PLL-shaped but don't drive any clock (e.g. V4 HPM-shadow registers at
+ * 0x12010014 / 0x1201000c — values vary by per-die silicon binning but
+ * empirically have zero effect on CPU/peripheral clock). */
+struct raw_reg_info {
+    const char *name;
+    const char *label;
+    uint32_t reg;
+    const char *note;
+};
+
+struct clock_family {
+    int chip_id; /* matches chip_generation, e.g. HISI_V4 */
+    const char *label;
+    const struct pll_info *plls;
+    size_t n_plls;
+    const struct mux_info *muxes;
+    size_t n_muxes;
+    const struct hpm_info *hpms;
+    size_t n_hpms;
+    const struct raw_reg_info *raws;
+    size_t n_raws;
+};
+
+/* Builds the cJSON tree for the current chip.
Returns NULL on unsupported
+ * chip family or when the SoC cannot be identified. Used by the
+ * `clocks`/`freq` subcommand and the default `ipctool` YAML survey. */
+cJSON *clocks_build_json(void);
+
+int clocks_cmd(int argc, char **argv);
+
+#endif /* CLOCKS_H */
diff --git a/src/cpubench.c b/src/cpubench.c
new file mode 100644
index 0000000..0c014a5
--- /dev/null
+++ b/src/cpubench.c
@@ -0,0 +1,260 @@
+/* `ipctool cpubench` — multi-pattern CPU clock triangulation.
+ *
+ * Runs tight inline-asm loops with three known instruction patterns and
+ * back-calculates the CPU clock from each. Used in ipctool#161 to settle
+ * which CRG register really drives the V4 CPU PLL (turned out: register
+ * pair at 0x12010000/0x12010004, NOT the 0x12010014 shadow the issue body
+ * originally identified). Kept as a battle-proven test for future board
+ * bring-up where the PLL register decode is suspect.
+ *
+ * Cortex-A7 reference throughput (Arm Cortex-A7 MPCore TRM):
+ *   - Dependent ADD chain    : 1.0 cyc/op (in-order, RAW stalls)
+ *   - Independent ADD pair   : 0.5 cyc/op (dual-issue, two ALU pipes)
+ *   - Dependent MUL chain    : 3.0 cyc/op (3-cyc latency, latency-bound)
+ *
+ * Inner-loop block has 16 ops; outer-loop overhead is 3 cycles per iter
+ * (sub + cmp + branch). Implied clock per pattern:
+ *
+ *   f_dep_add = ops_per_sec * (16 + 3) / 16 = ops_per_sec * 1.1875
+ *   f_indep   = ops_per_sec * (16 + 3) / 32 = ops_per_sec * 0.594
+ *   f_dep_mul = ops_per_sec * (16*3 + 3) / 16 = ops_per_sec * 3.1875
+ *
+ * Three independent timing models should converge to within ~2% on a
+ * Cortex-A7. Build is ARM-gated; on x86/etc. the subcommand reports
+ * "not supported".
+ */
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "cjson/cJSON.h"
+#include "cjson/cYAML.h"
+#include "cpubench.h"
+#include "tools.h"
+
+#ifdef __arm__
+
+/* Replicate a string 16 times so the inner asm block has 16 ops, amortising
+ * the outer-loop overhead (still accounted for in the clock model). */
+#define REP16(X) X X X X X X X X X X X X X X X X
+
+static volatile uint32_t sink_u;
+
+/* NOTE(review): the 3-cycle loop-overhead model assumes a single-register
+ * counter; `loops` is 64-bit, so on 32-bit ARM the compiler may emit a
+ * longer increment/compare sequence — confirm against the disassembly. */
+static double bench_dep_add(uint64_t loops) {
+    struct timespec t0, t1;
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    uint32_t r0 = 0;
+    for (uint64_t i = 0; i < loops; i++) {
+        __asm__ __volatile__(REP16("add %0, %0, #1\n\t") : "+r"(r0));
+    }
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    sink_u = r0;
+    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
+    return (double)loops * 16.0 / dt;
+}
+
+static double bench_indep_add(uint64_t loops) {
+    struct timespec t0, t1;
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    uint32_t r0 = 0, r1 = 0;
+    for (uint64_t i = 0; i < loops; i++) {
+        __asm__ __volatile__(REP16("add %0, %0, #1\n\t"
+                                   "add %1, %1, #1\n\t")
+                             : "+r"(r0), "+r"(r1));
+    }
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    sink_u = r0 + r1;
+    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
+    return (double)loops * 32.0 / dt;
+}
+
+static double bench_dep_mul(uint64_t loops) {
+    struct timespec t0, t1;
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    uint32_t r0 = 3;
+    for (uint64_t i = 0; i < loops; i++) {
+        __asm__ __volatile__(REP16("mul %0, %0, %0\n\t") : "+r"(r0));
+    }
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    sink_u = r0;
+    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
+    return (double)loops * 16.0 / dt;
+}
+
+struct pattern_result {
+    const char *name;  /* JSON key */
+    const char *label; /* short human label */
+    double ops_per_sec;
+    double freq_mhz;
+    double cycles_per_op; /* assumed model */
+    const char *model_note;
+};
+
+static cJSON *result_to_json(const struct pattern_result *r) {
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM_NUM("ops_per_sec_M", r->ops_per_sec / 1e6);
+    ADD_PARAM_NUM("cycles_per_op", r->cycles_per_op);
+    ADD_PARAM_NUM("freq_mhz", r->freq_mhz);
+    if (r->model_note)
+        ADD_PARAM("model", r->model_note);
+    return j_inner;
+}
+
+static cJSON *build_cpubench_json(uint64_t loops) {
+    /* Three timing models, with the 16-ops-per-block + 3-cyc outer overhead
+     * accounted for in the implied-clock calculation. */
+    struct pattern_result results[3];
+    results[0].name = "dep_add";
+    results[0].label = "Dependent integer ADD";
+    results[0].ops_per_sec = bench_dep_add(loops);
+    results[0].cycles_per_op = 19.0 / 16.0; /* 1 cyc/op + 3 cyc loop overhead */
+    results[0].freq_mhz =
+        results[0].ops_per_sec * results[0].cycles_per_op / 1e6;
+    results[0].model_note = "A7: 1 cyc/op dep ALU";
+
+    results[1].name = "indep_add";
+    results[1].label = "Independent integer ADD pair (dual-issue)";
+    results[1].ops_per_sec = bench_indep_add(loops);
+    results[1].cycles_per_op = 19.0 / 32.0; /* 16 cyc for 32 ops + 3 ovh */
+    results[1].freq_mhz =
+        results[1].ops_per_sec * results[1].cycles_per_op / 1e6;
+    results[1].model_note = "A7: 0.5 cyc/op dual-issue ALU";
+
+    results[2].name = "dep_mul";
+    results[2].label = "Dependent integer MUL";
+    results[2].ops_per_sec = bench_dep_mul(loops);
+    results[2].cycles_per_op =
+        (16.0 * 3.0 + 3.0) / 16.0; /* 3-cyc-latency dep MUL */
+    results[2].freq_mhz =
+        results[2].ops_per_sec * results[2].cycles_per_op / 1e6;
+    results[2].model_note = "A7: 3-cyc latency dep MUL";
+
+    cJSON *j_inner = cJSON_CreateObject();
+    ADD_PARAM(
+        "model",
+        "Cortex-A7 in-order, 3-cyc outer-loop overhead, 16-op inner block");
+    ADD_PARAM_NUM("loops", (double)loops);
+
+    cJSON *patterns = cJSON_CreateObject();
+    for (size_t i = 0; i < sizeof(results) / sizeof(results[0]); i++) {
+        cJSON_AddItemToObject(patterns, results[i].name,
+                              result_to_json(&results[i]));
+    }
+    cJSON_AddItemToObject(j_inner, "patterns", patterns);
+
+    /* Consensus: median of the three clock estimates (robust to one
+     * off-model estimator; in practice indep_add tends to under-shoot
+     * because A7's second ALU pipe doesn't always accept ADD-imm). Plus
+     * the min-to-max spread as a confidence indicator. */
+    double v[3] = {results[0].freq_mhz, results[1].freq_mhz,
+                   results[2].freq_mhz};
+    for (int a = 0; a < 3; a++)
+        for (int b = a + 1; b < 3; b++)
+            if (v[b] < v[a]) {
+                double t = v[a];
+                v[a] = v[b];
+                v[b] = t;
+            }
+    double median = v[1];
+    double spread_pct = (v[2] - v[0]) / median * 100.0;
+    ADD_PARAM_NUM("consensus_freq_mhz", median);
+    ADD_PARAM_NUM("spread_pct", spread_pct);
+    return j_inner;
+}
+
+static void print_cpubench_usage(void) {
+    printf("Usage: ipctool cpubench [--json] [--loops N]\n"
+           "\n"
+           "Triangulate CPU clock by running three tight inline-asm patterns\n"
+           "(dependent ADD, independent ADD pair, dependent MUL) and\n"
+           "back-calculating MHz from the known Cortex-A7 throughput.\n"
+           "\n"
+           "Three independent timing models converge to within ~2%% on a\n"
+           "healthy chip; large divergence suggests CPU contention (kill\n"
+           "majestic/encoder first) or a non-A7 core.\n"
+           "\n"
+           "Output is YAML by default; --json emits JSON.\n"
+           "Default loops = 30000000 (~5-7 s of runtime on Cortex-A7 @ ~900 "
+           "MHz).\n");
+}
+
+int cpubench_cmd(int argc, char **argv) {
+    bool want_json = false;
+    uint64_t loops = 30000000ULL;
+
+    const struct option long_options[] = {
+        {"json", no_argument, NULL, 'j'},
+        {"loops", required_argument, NULL, 'l'},
+        {"help", no_argument, NULL, 'h'},
+        {NULL, 0, NULL, 0},
+    };
+    int opt;
+    optind = 1;
+    while ((opt = getopt_long(argc, argv, "jl:h", long_options, NULL)) != -1) {
+        switch (opt) {
+        case 'j':
+            want_json = true;
+            break;
+        case 'l': {
+            /* strtoull() silently accepts "-1" (wrapping to ULLONG_MAX) and
+             * trailing junk, so require a leading digit and a clean end. */
+            char *end = NULL;
+            errno = 0;
+            unsigned long long v = strtoull(optarg, &end, 10);
+            if (optarg[0] < '0' || optarg[0] > '9' || *end != '\0' ||
+                errno == ERANGE) {
+                fprintf(stderr, "cpubench: invalid --loops value '%s'\n",
+                        optarg);
+                return EXIT_FAILURE;
+            }
+            loops = v;
+            if (loops < 1000000ULL) {
+                fprintf(stderr, "cpubench: --loops must be >= 1000000\n");
+                return EXIT_FAILURE;
+            }
+            break;
+        }
+        case 'h':
+            print_cpubench_usage();
+            return EXIT_SUCCESS;
+        default:
+            print_cpubench_usage();
+            return EXIT_FAILURE;
+        }
+    }
+
+    cJSON *bench = build_cpubench_json(loops);
+    cJSON *root = cJSON_CreateObject();
+    cJSON_AddItemToObject(root, "cpubench", bench);
+
+    char *out = want_json ? cJSON_Print(root) : cYAML_Print(root);
+    if (out) {
+        printf("%s", out);
+        if (want_json)
+            printf("\n");
+        free(out);
+    }
+    cJSON_Delete(root);
+    return EXIT_SUCCESS;
+}
+
+#else /* !__arm__ */
+
+int cpubench_cmd(int argc, char **argv) {
+    (void)argc;
+    (void)argv;
+    fprintf(stderr,
+            "cpubench: only supported on ARM builds (the inline-asm patterns\n"
+            " rely on Cortex-A7 throughput numbers).\n");
+    return EXIT_FAILURE;
+}
+
+#endif /* __arm__ */
diff --git a/src/cpubench.h b/src/cpubench.h
new file mode 100644
index 0000000..015428e
--- /dev/null
+++ b/src/cpubench.h
@@ -0,0 +1,6 @@
+#ifndef CPUBENCH_H
+#define CPUBENCH_H
+
+int cpubench_cmd(int argc, char **argv);
+
+#endif /* CPUBENCH_H */
diff --git a/src/hal/hisi/clocks_v4.c b/src/hal/hisi/clocks_v4.c
new file mode 100644
index 0000000..c603b0e
--- /dev/null
+++ b/src/hal/hisi/clocks_v4.c
@@ -0,0 +1,122 @@
+/* Clock / PLL / HPM register map for the Hisilicon V4 / Goke V300 family
+ * (3516EV200, 3516EV300, 3518EV300, 3516DV200, 7205V200/V210/V300,
+ * 7202V300/V330, 7201V200/V300, 7605V100, 7205V500/V510/V530).
+ *
+ * Same silicon die across HiSilicon and Goke V300 brandings — verified by
+ * matching `arch/arm/include/asm/arch-*v300/platform.h` between the two
+ * vendor u-boot trees and by benching all three lab boards at the same
+ * CPU clock (ipctool#161 ground-truth, 2026-05-14).
+ *
+ * CPU PLL (APLL) decode borrowed from the Hi3516A SDK kernel patch
+ * (`struct hi3516a_pll_clock` in linux-4.9.37.patch):
+ *   ctrl_reg1 (CRG+0x00): FRACDIV[23:0], POSTDIV1[26:24], POSTDIV2[30:28]
+ *   ctrl_reg2 (CRG+0x04): FBDIV[11:0], REFDIV[17:12]
+ *   f = 24 MHz * FBDIV / (REFDIV * POSTDIV1 * POSTDIV2)
+ *
+ * Validated empirically on V4: read pair (0x12010000, 0x12010004) =
+ * (0x12000000, 0x0100104B) decodes to FBDIV=75, REFDIV=1, POSTDIV1=2,
+ * POSTDIV2=1 → 900 MHz, within 0.2% of the multi-pattern bench
+ * triangulation (see `ipctool cpubench`).
+ *
+ * NOTE: registers 0x12010014 and 0x1201000c on Goke-branded boards hold
+ * FBDIV-shaped values written by the mask ROM based on per-die HPM
+ * binning, BUT they do not drive any active clock — empirically confirmed
+ * by running identical CPU benchmarks on three V4 boards with three
+ * different values at 0x12010014 and finding identical 900 MHz operation.
+ * The issue #161 body identifies 0x12010014 as "CPU PLL FBDIV"; that
+ * interpretation is falsified. We surface those registers as raw
+ * diagnostic dumps so users can still spot per-die HPM correlation
+ * without misreading them as clock dividers.
+ */
+
+#include "clocks.h"
+#include "hal/hisi/hal_hisi.h"
+
+static const struct pll_info v4_plls[] = {
+    {
+        .name = "cpu_pll",
+        .label = "CPU PLL (APLL)",
+        .ctrl_reg1 = 0x12010000, /* FRACDIV / POSTDIV1 / POSTDIV2 */
+        .frac_shift = 0,
+        .frac_width = 24, /* bits [23:0] */
+        .postdiv1_shift = 24,
+        .postdiv1_width = 3, /* bits [26:24] */
+        .postdiv2_shift = 28,
+        .postdiv2_width = 3, /* bits [30:28] */
+        .ctrl_reg2 = 0x12010004, /* FBDIV / REFDIV */
+        .fbdiv_shift = 0,
+        .fbdiv_width = 12, /* bits [11:0] */
+        .refdiv_shift = 12,
+        .refdiv_width = 6, /* bits [17:12] */
+        .input_khz = 24000, /* 24 MHz reference */
+    },
+};
+
+/* CRG[0x80] bits[5:3] — DDR clock mux (see issue #161).
*/ +static const struct mux_entry v4_ddr_table[] = { + {0b000, 24}, + {0b001, 450}, + {0b011, 300}, + {0b100, 297}, +}; + +static const struct mux_info v4_muxes[] = { + { + .name = "ddr", + .label = "DDR", + .reg = 0x12010080, + .sel_shift = 3, + .sel_mask = 0x07, + .table = v4_ddr_table, + .table_len = sizeof(v4_ddr_table) / sizeof(v4_ddr_table[0]), + .rate_mult = + 4, /* DDR3 quad-pumped — TODO: verify for LPDDRx variants */ + }, +}; + +static const struct hpm_info v4_hpms[] = { + { + .name = "hpm", + .label = "HPM core", + .reg = 0x1202015C, /* HPM_CHECK_REG */ + .value_shift = 16, + .value_mask = 0x03FF, + .window_min = 150, /* HPM_CORE_MIN */ + .window_max = 350, /* HPM_CORE_MAX */ + .bin_min = 190, /* HPM_CORE_VALUE_MIN */ + .bin_max = 310, /* HPM_CORE_VALUE_MAX */ + .aux_reg = 0x120280D8, /* HPM_CORE_REG0 — per-die fingerprint */ + .aux_name = "hpm_core_reg0", + }, +}; + +/* Mask-ROM-written HPM-shadow registers — vary by per-die silicon binning, + * empirically do NOT drive any active clock. Surfaced for diagnostic / + * fleet-comparison purposes only. See block comment at top of file. 
 */
+static const struct raw_reg_info v4_raws[] = {
+    {
+        .name = "pll_shadow_0c", /* JSON key */
+        .label = "PLL-shadow @ CRG[0x0c]",
+        .reg = 0x1201000C,
+        .note = "mask-ROM HPM-bin shadow; not a live FBDIV (see clocks_v4.c)",
+    },
+    {
+        .name = "pll_shadow_14", /* JSON key */
+        .label = "PLL-shadow @ CRG[0x14]",
+        .reg = 0x12010014,
+        .note = "mask-ROM HPM-bin shadow; not a live FBDIV (see clocks_v4.c)",
+    },
+};
+
+const struct clock_family clocks_family_v4 = {
+    .chip_id = HISI_V4, /* compared against chip_generation at lookup */
+    .label = "Hisilicon V4 / Goke V300",
+    .plls = v4_plls,
+    .n_plls = sizeof(v4_plls) / sizeof(v4_plls[0]),
+    .muxes = v4_muxes,
+    .n_muxes = sizeof(v4_muxes) / sizeof(v4_muxes[0]),
+    .hpms = v4_hpms,
+    .n_hpms = sizeof(v4_hpms) / sizeof(v4_hpms[0]),
+    .raws = v4_raws,
+    .n_raws = sizeof(v4_raws) / sizeof(v4_raws[0]),
+};
diff --git a/src/main.c b/src/main.c
index 10c9e63..9743f75 100644
--- a/src/main.c
+++ b/src/main.c
@@ -16,6 +16,8 @@
 #include "chipid.h"
 #include "cjson/cJSON.h"
 #include "cjson/cYAML.h"
+#include "clocks.h"
+#include "cpubench.h"
 #include "ethernet.h"
 #include "firmware.h"
 #include "hal/hisi/hal_hisi.h"
@@ -98,6 +100,13 @@ void print_usage() {
         " i2cdetect [-b, --bus] attempt to detect devices on I2C bus\n"
         " reginfo [--script] dump current status of pinmux registers\n"
         " gpio (scan|mux) GPIO utilities\n"
+        " clocks|freq [--json] show CPU/peripheral PLL, DDR clock and\n"
+        " per-die HPM bin (Hisilicon V4 / Goke "
+        "V300\n"
+        " family only for now)\n"
+        " cpubench [--json] [--loops N]\n"
+        " triangulate CPU clock by running three\n"
+        " tight inline-asm patterns (ARM only)\n"
         " sensor monitor poll AE/exposure registers from the\n"
         " running sensor every 2s. Supported:\n"
         " SC2315E, IMX291, IMX385.\n"
@@ -134,6 +143,7 @@ static cJSON *build_yaml() {
     add_yaml_fragment(root, "ram", detect_ram());
     add_yaml_fragment(root, "firmware", detect_firmare());
     add_yaml_fragment(root, "sensors", detect_sensors());
+    add_yaml_fragment(root, "clocks", clocks_build_json());
 
     return root;
 }
@@ -172,6 +182,10 @@ int main(int argc, char *argv[]) {
         return mtd_unlock_cmd();
     else if (!strcmp(argv[optind], "sensor"))
         return snstool_cmd(argc - 1, argv + 1);
+    else if (!strcmp(argv[1], "clocks") || !strcmp(argv[1], "freq"))
+        return clocks_cmd(argc - 1, argv + 1);
+    else if (!strcmp(argv[1], "cpubench"))
+        return cpubench_cmd(argc - 1, argv + 1);
 #ifdef __arm__
     else if (!strcmp(argv[1], "trace"))
         return ptrace_cmd(argc - 1, argv + 1);