From 1f70e82cf7ae1a7027b37b3be8e4a0931c58d424 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Wed, 7 Jan 2026 17:17:30 +0100 Subject: [PATCH 01/18] Initial commit of software library --- .env | 2 +- .gitignore | 11 +- .vscode/settings.json | 2 +- CMakeLists.txt | 21 +- README.md | 49 +- coyote | 2 +- hardware/CMakeLists.txt | 20 + .../src}/hdl/axi/axi_demultiplexer.sv | 0 .../src}/hdl/axi/axi_width_converter.sv | 0 {src => hardware/src}/hdl/common.sv | 0 .../src}/hdl/config/config_interfaces.sv | 0 .../src}/hdl/config/config_macros.svh | 0 .../src}/hdl/config/global_config.sv | 0 .../src}/hdl/config/mem_config.sv | 0 .../src}/hdl/config/read_config_splitter.sv | 0 .../config/registers/config_read_register.sv | 0 .../registers/config_read_register_file.sv | 0 .../hdl/config/registers/config_write_fifo.sv | 0 .../registers/config_write_ready_register.sv | 0 .../config/registers/config_write_register.sv | 0 .../src}/hdl/config/stream_config.sv | 0 .../src}/hdl/config/write_config_splitter.sv | 0 {src => hardware/src}/hdl/crossbar/coupler.sv | 0 .../src}/hdl/crossbar/creditor.sv | 0 .../src}/hdl/crossbar/crossbar.sv | 0 .../src}/hdl/crossbar/decoupler.sv | 0 {src => hardware/src}/hdl/crossbar/reorder.sv | 0 .../src}/hdl/crossbar/reorder_decoupler.sv | 0 .../src}/hdl/crossbar/reorder_enumerator.sv | 0 .../src}/hdl/crossbar/tagged_duplicator.sv | 0 .../src}/hdl/crossbar/tagged_multiplexer.sv | 0 {src => hardware/src}/hdl/data_interfaces.sv | 0 {src => hardware/src}/hdl/dict/deduplicate.sv | 0 .../src}/hdl/dict/deduplicate_stage.sv | 0 {src => hardware/src}/hdl/dict/dictionary.sv | 0 .../src}/hdl/dict/dictionary_bank.sv | 0 {src => hardware/src}/hdl/dict/duplicate.sv | 0 .../src}/hdl/dict/typed_dictionary.sv | 0 {src => hardware/src}/hdl/fifo/fifo.vhd | 0 {src => hardware/src}/hdl/fifo/fifo_axi.sv | 0 .../src}/hdl/fifo/multi_insert_fifo.vhd | 0 .../src}/hdl/fifo/multi_insert_fifo_axi.sv | 0 .../src}/hdl/hashing/murmur_hasher.sv | 0 .../src}/hdl/hashing/stream_hasher.sv | 
0 {src => hardware/src}/hdl/libstf_macros.svh | 0 .../src}/hdl/normalization/barrel_shifter.sv | 0 .../hdl/normalization/constant_shifter.sv | 0 .../src}/hdl/normalization/data_compactor.sv | 0 .../hdl/normalization/data_compactor_level.sv | 0 .../src}/hdl/normalization/data_normalizer.sv | 0 .../src}/hdl/output/output_writer.sv | 0 .../src}/hdl/output/stream_writer.sv | 0 .../src}/hdl/stream/data_adapters.sv | 0 .../src}/hdl/stream/data_adapters_typed.sv | 0 .../src}/hdl/stream/data_demultiplexer.sv | 0 .../src}/hdl/stream/data_enumerator.sv | 0 .../src}/hdl/stream/data_multiplexer.sv | 0 {src => hardware/src}/hdl/stream/data_sink.sv | 0 .../src}/hdl/stream/data_splitter.sv | 0 .../src}/hdl/stream/data_width_converter.sv | 0 .../src}/hdl/test/cyclic_drivers.sv | 0 .../src}/hdl/util/demultiplexer.sv | 0 .../src}/hdl/util/meta_intf_arbiter.sv | 0 .../src}/hdl/util/performance_counter.sv | 0 {src => hardware/src}/hdl/util/ram.sv | 0 .../src}/hdl/util/ready_valid_helpers.sv | 0 .../src}/hdl/util/reset_resync.sv | 0 .../src}/hdl/util/shift_register.vhd | 0 {src => hardware/src}/hdl/util/skid_buffer.sv | 0 {src => hardware/src}/vfpga_top.svh | 0 .../unit-tests}/dict_test.py | 0 .../unit-tests}/libstf_utils/__init__.py | 0 .../unit-tests}/libstf_utils/common.py | 0 .../libstf_utils/configured_test_case.py | 0 .../libstf_utils/fpga_configuration.py | 0 .../libstf_utils/memory_manager.py | 0 .../libstf_utils/output_writer_test_case.py | 0 .../unit-tests}/normalizer_test.py | 0 .../unit-tests}/output_writer_test.py | 0 .../unit-tests}/typed_dict_test.py | 0 .../unit-tests}/vfpga_tops/dict_test.sv | 0 .../unit-tests}/vfpga_tops/normalizer_test.sv | 0 .../vfpga_tops/output_writer_test.sv | 0 .../unit-tests}/vfpga_tops/typed_dict_test.sv | 0 scripts/install_jemalloc.sh | 38 ++ sim_setup.sh | 1 + software/CMakeLists.txt | 33 ++ software/cmake/FindJemalloc.cmake | 34 ++ software/src/libstf/buffer.cpp | 11 + software/src/libstf/buffer.hpp | 31 ++ software/src/libstf/common.cpp | 
26 + software/src/libstf/common.hpp | 54 +++ software/src/libstf/configuration.cpp | 129 +++++ software/src/libstf/configuration.hpp | 108 +++++ software/src/libstf/error_handling.hpp | 61 +++ software/src/libstf/memory_pool.cpp | 457 ++++++++++++++++++ software/src/libstf/memory_pool.hpp | 281 +++++++++++ software/src/libstf/tlb_manager.cpp | 44 ++ software/src/libstf/tlb_manager.hpp | 48 ++ 99 files changed, 1423 insertions(+), 40 deletions(-) create mode 100644 hardware/CMakeLists.txt rename {src => hardware/src}/hdl/axi/axi_demultiplexer.sv (100%) rename {src => hardware/src}/hdl/axi/axi_width_converter.sv (100%) rename {src => hardware/src}/hdl/common.sv (100%) rename {src => hardware/src}/hdl/config/config_interfaces.sv (100%) rename {src => hardware/src}/hdl/config/config_macros.svh (100%) rename {src => hardware/src}/hdl/config/global_config.sv (100%) rename {src => hardware/src}/hdl/config/mem_config.sv (100%) rename {src => hardware/src}/hdl/config/read_config_splitter.sv (100%) rename {src => hardware/src}/hdl/config/registers/config_read_register.sv (100%) rename {src => hardware/src}/hdl/config/registers/config_read_register_file.sv (100%) rename {src => hardware/src}/hdl/config/registers/config_write_fifo.sv (100%) rename {src => hardware/src}/hdl/config/registers/config_write_ready_register.sv (100%) rename {src => hardware/src}/hdl/config/registers/config_write_register.sv (100%) rename {src => hardware/src}/hdl/config/stream_config.sv (100%) rename {src => hardware/src}/hdl/config/write_config_splitter.sv (100%) rename {src => hardware/src}/hdl/crossbar/coupler.sv (100%) rename {src => hardware/src}/hdl/crossbar/creditor.sv (100%) rename {src => hardware/src}/hdl/crossbar/crossbar.sv (100%) rename {src => hardware/src}/hdl/crossbar/decoupler.sv (100%) rename {src => hardware/src}/hdl/crossbar/reorder.sv (100%) rename {src => hardware/src}/hdl/crossbar/reorder_decoupler.sv (100%) rename {src => hardware/src}/hdl/crossbar/reorder_enumerator.sv (100%) 
rename {src => hardware/src}/hdl/crossbar/tagged_duplicator.sv (100%) rename {src => hardware/src}/hdl/crossbar/tagged_multiplexer.sv (100%) rename {src => hardware/src}/hdl/data_interfaces.sv (100%) rename {src => hardware/src}/hdl/dict/deduplicate.sv (100%) rename {src => hardware/src}/hdl/dict/deduplicate_stage.sv (100%) rename {src => hardware/src}/hdl/dict/dictionary.sv (100%) rename {src => hardware/src}/hdl/dict/dictionary_bank.sv (100%) rename {src => hardware/src}/hdl/dict/duplicate.sv (100%) rename {src => hardware/src}/hdl/dict/typed_dictionary.sv (100%) rename {src => hardware/src}/hdl/fifo/fifo.vhd (100%) rename {src => hardware/src}/hdl/fifo/fifo_axi.sv (100%) rename {src => hardware/src}/hdl/fifo/multi_insert_fifo.vhd (100%) rename {src => hardware/src}/hdl/fifo/multi_insert_fifo_axi.sv (100%) rename {src => hardware/src}/hdl/hashing/murmur_hasher.sv (100%) rename {src => hardware/src}/hdl/hashing/stream_hasher.sv (100%) rename {src => hardware/src}/hdl/libstf_macros.svh (100%) rename {src => hardware/src}/hdl/normalization/barrel_shifter.sv (100%) rename {src => hardware/src}/hdl/normalization/constant_shifter.sv (100%) rename {src => hardware/src}/hdl/normalization/data_compactor.sv (100%) rename {src => hardware/src}/hdl/normalization/data_compactor_level.sv (100%) rename {src => hardware/src}/hdl/normalization/data_normalizer.sv (100%) rename {src => hardware/src}/hdl/output/output_writer.sv (100%) rename {src => hardware/src}/hdl/output/stream_writer.sv (100%) rename {src => hardware/src}/hdl/stream/data_adapters.sv (100%) rename {src => hardware/src}/hdl/stream/data_adapters_typed.sv (100%) rename {src => hardware/src}/hdl/stream/data_demultiplexer.sv (100%) rename {src => hardware/src}/hdl/stream/data_enumerator.sv (100%) rename {src => hardware/src}/hdl/stream/data_multiplexer.sv (100%) rename {src => hardware/src}/hdl/stream/data_sink.sv (100%) rename {src => hardware/src}/hdl/stream/data_splitter.sv (100%) rename {src => 
hardware/src}/hdl/stream/data_width_converter.sv (100%) rename {src => hardware/src}/hdl/test/cyclic_drivers.sv (100%) rename {src => hardware/src}/hdl/util/demultiplexer.sv (100%) rename {src => hardware/src}/hdl/util/meta_intf_arbiter.sv (100%) rename {src => hardware/src}/hdl/util/performance_counter.sv (100%) rename {src => hardware/src}/hdl/util/ram.sv (100%) rename {src => hardware/src}/hdl/util/ready_valid_helpers.sv (100%) rename {src => hardware/src}/hdl/util/reset_resync.sv (100%) rename {src => hardware/src}/hdl/util/shift_register.vhd (100%) rename {src => hardware/src}/hdl/util/skid_buffer.sv (100%) rename {src => hardware/src}/vfpga_top.svh (100%) rename {unit-tests => hardware/unit-tests}/dict_test.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/__init__.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/common.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/configured_test_case.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/fpga_configuration.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/memory_manager.py (100%) rename {unit-tests => hardware/unit-tests}/libstf_utils/output_writer_test_case.py (100%) rename {unit-tests => hardware/unit-tests}/normalizer_test.py (100%) rename {unit-tests => hardware/unit-tests}/output_writer_test.py (100%) rename {unit-tests => hardware/unit-tests}/typed_dict_test.py (100%) rename {unit-tests => hardware/unit-tests}/vfpga_tops/dict_test.sv (100%) rename {unit-tests => hardware/unit-tests}/vfpga_tops/normalizer_test.sv (100%) rename {unit-tests => hardware/unit-tests}/vfpga_tops/output_writer_test.sv (100%) rename {unit-tests => hardware/unit-tests}/vfpga_tops/typed_dict_test.sv (100%) create mode 100755 scripts/install_jemalloc.sh create mode 100644 software/CMakeLists.txt create mode 100644 software/cmake/FindJemalloc.cmake create mode 100644 software/src/libstf/buffer.cpp create mode 100644 software/src/libstf/buffer.hpp 
create mode 100644 software/src/libstf/common.cpp create mode 100644 software/src/libstf/common.hpp create mode 100644 software/src/libstf/configuration.cpp create mode 100644 software/src/libstf/configuration.hpp create mode 100644 software/src/libstf/error_handling.hpp create mode 100644 software/src/libstf/memory_pool.cpp create mode 100644 software/src/libstf/memory_pool.hpp create mode 100644 software/src/libstf/tlb_manager.cpp create mode 100644 software/src/libstf/tlb_manager.hpp diff --git a/.env b/.env index f3006da..ecf391f 100644 --- a/.env +++ b/.env @@ -1 +1 @@ -PYTHONPATH=build_hw:coyote/sim \ No newline at end of file +PYTHONPATH=hardware/build_hw:coyote/sim \ No newline at end of file diff --git a/.gitignore b/.gitignore index 774795d..5d48747 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ -build_hw/** +build +hardware/build_hw/** # Generated unit test files -unit-tests/diff/** -unit-tests/sim_dump.vcd -unit-tests/sim.out -unit-tests/vapor_view.json +hardware/unit-tests/diff/** +hardware/unit-tests/sim_dump.vcd +hardware/unit-tests/sim.out +hardware/unit-tests/vapor_view.json # Vivado generated files *.jou diff --git a/.vscode/settings.json b/.vscode/settings.json index ef84459..9a42865 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,7 +2,7 @@ "python.testing.unittestArgs": [ "-v", "-s", - "./unit-tests", + "./hardware/unit-tests", "-p", "*_test.py" ], diff --git a/CMakeLists.txt b/CMakeLists.txt index a329022..eb0a887 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,20 +1,9 @@ -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.16) project(libstf) -set(CYT_DIR ${CMAKE_SOURCE_DIR}/coyote/) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CYT_DIR}/cmake) +# NOTE: This CMakeLists.txt is only here because of VSCode -find_package(CoyoteHW REQUIRED) +# Find dependencies +add_subdirectory(coyote/sw coyote) -set(N_REGIONS 1) -set(EN_STRM 1) -set(N_STRM_AXI 4) -set(FDEV_NAME u55c) - 
-validation_checks_hw() - -load_apps ( - VFPGA_C0_0 "src" -) - -create_hw() +add_subdirectory(software) diff --git a/README.md b/README.md index 9173c06..cb0b3e2 100644 --- a/README.md +++ b/README.md @@ -12,44 +12,61 @@ This library is a collection of common modules used to develop hardware designs. Some modules are specific to developing vFPGAs for Coyote. The main functionality libstf offers is: -- Interfaces that transport a specific number of elements, adapters to AXI4S interfaces, and other helpers (src/data_interfaces.sv and src/hdl/stream) -- Utilities for configuration registers accessed through an AXI4L interface (src/hdl/config) -- Crossbar (src/hdl/crossbar) -- Dictionary (src/hdl/dict) -- Stream normalization (src/hdl/normalization) -- vFPGA-specific stream output writer that generates memory requests and communicates with a memory manager on the CPU side (src/hdl/output) +- Interfaces that transport a specific number of elements (ndata), adapters to AXI4S interfaces, and other helpers (hardware/src/hdl/data_interfaces.sv and hardware/src/hdl/stream) +- Utilities for configuration registers accessed through an AXI4L interface (hardware/src/hdl/config) +- Crossbar (hardware/src/hdl/crossbar) +- Dictionary (hardware/src/hdl/dict) +- Stream normalization (hardware/src/hdl/normalization) +- Coyote vFPGA-specific stream output writer that generates memory requests and communicates with an output buffer manager on the CPU side (hardware/src/hdl/output) -This library is *work in progress*. +It also features some of the corresponding software-side components to use the hardware. This +repository is *work in progress* and quite a bit of documentation is missing. Please feel free to +contribute. 
-## Getting Started +## Getting Started (Hardware) Clone the repo and download the Coyote submodule: ```bash git clone --recurse-submodules https://github.com/fpgasystems/libstf ``` The recommended way to get started with libstf is to start exploring the Python unit tests in the -`unit-tests` folder. To execute the unit tests, you have to set up the Coyote simulation project -with: +`hardware/unit-tests` folder. To execute the unit tests, you have to set up the Coyote simulation +project with: ```bash ./sim_setup.sh ``` -This creates the `build_hw` folder. Anytime you create or rename files in `src/hdl`, you have to -execute this command again. Afterwards, the unit tests should show up in VSCode. +This creates the `hardware/build_hw` folder. Anytime you create or rename files in `hardware/src/hdl`, you +have to execute this command again. Afterwards, the unit tests should show up in VSCode. + +## Getting Started (Software) +You have to install jemalloc, which is required for the HugePageMemoryPool, like this: + +```bash +./scripts/install_jemalloc.sh +``` + +This will install jemalloc in `~/opt/jemalloc`. Then you can build the libstf library as follows: + +```bash +mkdir build && cd build +cmake .. +make +``` ## Code Style For now, we have a couple of code style rules: - Camel case for class names and snake case for file names and everything else in the code -- _i postfix for interfaces -- _t postfix for types +- _i suffix for interfaces +- _t suffix for types - n_ prefix for next signals in state logic - inst_ prefix for module instantiations -- *Width* always refers to width in bits and *size* to width in bytes +- The term *width* always refers to width in bits and *size* to width in bytes ## TODOs -1. Get types and NUM_ELEMENTS from interface instead of parameters +1. Get types and NUM_ELEMENTS from interface instead of parameters (on hold because this crashes Vivado sometimes) 2. 
Add interface assertions ## License diff --git a/coyote b/coyote index a29cf61..0f6d3e1 160000 --- a/coyote +++ b/coyote @@ -1 +1 @@ -Subproject commit a29cf61294880e629b00cc70d880487b2085c0c3 +Subproject commit 0f6d3e1cfa556647c416b0574a5460d224dedeaa diff --git a/hardware/CMakeLists.txt b/hardware/CMakeLists.txt new file mode 100644 index 0000000..893f4f8 --- /dev/null +++ b/hardware/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.0) +project(libstf) + +set(CYT_DIR ${CMAKE_SOURCE_DIR}/../coyote) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CYT_DIR}/cmake) + +find_package(CoyoteHW REQUIRED) + +set(N_REGIONS 1) +set(EN_STRM 1) +set(N_STRM_AXI 4) +set(FDEV_NAME u55c) + +validation_checks_hw() + +load_apps ( + VFPGA_C0_0 "src" +) + +create_hw() diff --git a/src/hdl/axi/axi_demultiplexer.sv b/hardware/src/hdl/axi/axi_demultiplexer.sv similarity index 100% rename from src/hdl/axi/axi_demultiplexer.sv rename to hardware/src/hdl/axi/axi_demultiplexer.sv diff --git a/src/hdl/axi/axi_width_converter.sv b/hardware/src/hdl/axi/axi_width_converter.sv similarity index 100% rename from src/hdl/axi/axi_width_converter.sv rename to hardware/src/hdl/axi/axi_width_converter.sv diff --git a/src/hdl/common.sv b/hardware/src/hdl/common.sv similarity index 100% rename from src/hdl/common.sv rename to hardware/src/hdl/common.sv diff --git a/src/hdl/config/config_interfaces.sv b/hardware/src/hdl/config/config_interfaces.sv similarity index 100% rename from src/hdl/config/config_interfaces.sv rename to hardware/src/hdl/config/config_interfaces.sv diff --git a/src/hdl/config/config_macros.svh b/hardware/src/hdl/config/config_macros.svh similarity index 100% rename from src/hdl/config/config_macros.svh rename to hardware/src/hdl/config/config_macros.svh diff --git a/src/hdl/config/global_config.sv b/hardware/src/hdl/config/global_config.sv similarity index 100% rename from src/hdl/config/global_config.sv rename to hardware/src/hdl/config/global_config.sv diff --git 
a/src/hdl/config/mem_config.sv b/hardware/src/hdl/config/mem_config.sv similarity index 100% rename from src/hdl/config/mem_config.sv rename to hardware/src/hdl/config/mem_config.sv diff --git a/src/hdl/config/read_config_splitter.sv b/hardware/src/hdl/config/read_config_splitter.sv similarity index 100% rename from src/hdl/config/read_config_splitter.sv rename to hardware/src/hdl/config/read_config_splitter.sv diff --git a/src/hdl/config/registers/config_read_register.sv b/hardware/src/hdl/config/registers/config_read_register.sv similarity index 100% rename from src/hdl/config/registers/config_read_register.sv rename to hardware/src/hdl/config/registers/config_read_register.sv diff --git a/src/hdl/config/registers/config_read_register_file.sv b/hardware/src/hdl/config/registers/config_read_register_file.sv similarity index 100% rename from src/hdl/config/registers/config_read_register_file.sv rename to hardware/src/hdl/config/registers/config_read_register_file.sv diff --git a/src/hdl/config/registers/config_write_fifo.sv b/hardware/src/hdl/config/registers/config_write_fifo.sv similarity index 100% rename from src/hdl/config/registers/config_write_fifo.sv rename to hardware/src/hdl/config/registers/config_write_fifo.sv diff --git a/src/hdl/config/registers/config_write_ready_register.sv b/hardware/src/hdl/config/registers/config_write_ready_register.sv similarity index 100% rename from src/hdl/config/registers/config_write_ready_register.sv rename to hardware/src/hdl/config/registers/config_write_ready_register.sv diff --git a/src/hdl/config/registers/config_write_register.sv b/hardware/src/hdl/config/registers/config_write_register.sv similarity index 100% rename from src/hdl/config/registers/config_write_register.sv rename to hardware/src/hdl/config/registers/config_write_register.sv diff --git a/src/hdl/config/stream_config.sv b/hardware/src/hdl/config/stream_config.sv similarity index 100% rename from src/hdl/config/stream_config.sv rename to 
hardware/src/hdl/config/stream_config.sv diff --git a/src/hdl/config/write_config_splitter.sv b/hardware/src/hdl/config/write_config_splitter.sv similarity index 100% rename from src/hdl/config/write_config_splitter.sv rename to hardware/src/hdl/config/write_config_splitter.sv diff --git a/src/hdl/crossbar/coupler.sv b/hardware/src/hdl/crossbar/coupler.sv similarity index 100% rename from src/hdl/crossbar/coupler.sv rename to hardware/src/hdl/crossbar/coupler.sv diff --git a/src/hdl/crossbar/creditor.sv b/hardware/src/hdl/crossbar/creditor.sv similarity index 100% rename from src/hdl/crossbar/creditor.sv rename to hardware/src/hdl/crossbar/creditor.sv diff --git a/src/hdl/crossbar/crossbar.sv b/hardware/src/hdl/crossbar/crossbar.sv similarity index 100% rename from src/hdl/crossbar/crossbar.sv rename to hardware/src/hdl/crossbar/crossbar.sv diff --git a/src/hdl/crossbar/decoupler.sv b/hardware/src/hdl/crossbar/decoupler.sv similarity index 100% rename from src/hdl/crossbar/decoupler.sv rename to hardware/src/hdl/crossbar/decoupler.sv diff --git a/src/hdl/crossbar/reorder.sv b/hardware/src/hdl/crossbar/reorder.sv similarity index 100% rename from src/hdl/crossbar/reorder.sv rename to hardware/src/hdl/crossbar/reorder.sv diff --git a/src/hdl/crossbar/reorder_decoupler.sv b/hardware/src/hdl/crossbar/reorder_decoupler.sv similarity index 100% rename from src/hdl/crossbar/reorder_decoupler.sv rename to hardware/src/hdl/crossbar/reorder_decoupler.sv diff --git a/src/hdl/crossbar/reorder_enumerator.sv b/hardware/src/hdl/crossbar/reorder_enumerator.sv similarity index 100% rename from src/hdl/crossbar/reorder_enumerator.sv rename to hardware/src/hdl/crossbar/reorder_enumerator.sv diff --git a/src/hdl/crossbar/tagged_duplicator.sv b/hardware/src/hdl/crossbar/tagged_duplicator.sv similarity index 100% rename from src/hdl/crossbar/tagged_duplicator.sv rename to hardware/src/hdl/crossbar/tagged_duplicator.sv diff --git a/src/hdl/crossbar/tagged_multiplexer.sv 
b/hardware/src/hdl/crossbar/tagged_multiplexer.sv similarity index 100% rename from src/hdl/crossbar/tagged_multiplexer.sv rename to hardware/src/hdl/crossbar/tagged_multiplexer.sv diff --git a/src/hdl/data_interfaces.sv b/hardware/src/hdl/data_interfaces.sv similarity index 100% rename from src/hdl/data_interfaces.sv rename to hardware/src/hdl/data_interfaces.sv diff --git a/src/hdl/dict/deduplicate.sv b/hardware/src/hdl/dict/deduplicate.sv similarity index 100% rename from src/hdl/dict/deduplicate.sv rename to hardware/src/hdl/dict/deduplicate.sv diff --git a/src/hdl/dict/deduplicate_stage.sv b/hardware/src/hdl/dict/deduplicate_stage.sv similarity index 100% rename from src/hdl/dict/deduplicate_stage.sv rename to hardware/src/hdl/dict/deduplicate_stage.sv diff --git a/src/hdl/dict/dictionary.sv b/hardware/src/hdl/dict/dictionary.sv similarity index 100% rename from src/hdl/dict/dictionary.sv rename to hardware/src/hdl/dict/dictionary.sv diff --git a/src/hdl/dict/dictionary_bank.sv b/hardware/src/hdl/dict/dictionary_bank.sv similarity index 100% rename from src/hdl/dict/dictionary_bank.sv rename to hardware/src/hdl/dict/dictionary_bank.sv diff --git a/src/hdl/dict/duplicate.sv b/hardware/src/hdl/dict/duplicate.sv similarity index 100% rename from src/hdl/dict/duplicate.sv rename to hardware/src/hdl/dict/duplicate.sv diff --git a/src/hdl/dict/typed_dictionary.sv b/hardware/src/hdl/dict/typed_dictionary.sv similarity index 100% rename from src/hdl/dict/typed_dictionary.sv rename to hardware/src/hdl/dict/typed_dictionary.sv diff --git a/src/hdl/fifo/fifo.vhd b/hardware/src/hdl/fifo/fifo.vhd similarity index 100% rename from src/hdl/fifo/fifo.vhd rename to hardware/src/hdl/fifo/fifo.vhd diff --git a/src/hdl/fifo/fifo_axi.sv b/hardware/src/hdl/fifo/fifo_axi.sv similarity index 100% rename from src/hdl/fifo/fifo_axi.sv rename to hardware/src/hdl/fifo/fifo_axi.sv diff --git a/src/hdl/fifo/multi_insert_fifo.vhd b/hardware/src/hdl/fifo/multi_insert_fifo.vhd similarity 
index 100% rename from src/hdl/fifo/multi_insert_fifo.vhd rename to hardware/src/hdl/fifo/multi_insert_fifo.vhd diff --git a/src/hdl/fifo/multi_insert_fifo_axi.sv b/hardware/src/hdl/fifo/multi_insert_fifo_axi.sv similarity index 100% rename from src/hdl/fifo/multi_insert_fifo_axi.sv rename to hardware/src/hdl/fifo/multi_insert_fifo_axi.sv diff --git a/src/hdl/hashing/murmur_hasher.sv b/hardware/src/hdl/hashing/murmur_hasher.sv similarity index 100% rename from src/hdl/hashing/murmur_hasher.sv rename to hardware/src/hdl/hashing/murmur_hasher.sv diff --git a/src/hdl/hashing/stream_hasher.sv b/hardware/src/hdl/hashing/stream_hasher.sv similarity index 100% rename from src/hdl/hashing/stream_hasher.sv rename to hardware/src/hdl/hashing/stream_hasher.sv diff --git a/src/hdl/libstf_macros.svh b/hardware/src/hdl/libstf_macros.svh similarity index 100% rename from src/hdl/libstf_macros.svh rename to hardware/src/hdl/libstf_macros.svh diff --git a/src/hdl/normalization/barrel_shifter.sv b/hardware/src/hdl/normalization/barrel_shifter.sv similarity index 100% rename from src/hdl/normalization/barrel_shifter.sv rename to hardware/src/hdl/normalization/barrel_shifter.sv diff --git a/src/hdl/normalization/constant_shifter.sv b/hardware/src/hdl/normalization/constant_shifter.sv similarity index 100% rename from src/hdl/normalization/constant_shifter.sv rename to hardware/src/hdl/normalization/constant_shifter.sv diff --git a/src/hdl/normalization/data_compactor.sv b/hardware/src/hdl/normalization/data_compactor.sv similarity index 100% rename from src/hdl/normalization/data_compactor.sv rename to hardware/src/hdl/normalization/data_compactor.sv diff --git a/src/hdl/normalization/data_compactor_level.sv b/hardware/src/hdl/normalization/data_compactor_level.sv similarity index 100% rename from src/hdl/normalization/data_compactor_level.sv rename to hardware/src/hdl/normalization/data_compactor_level.sv diff --git a/src/hdl/normalization/data_normalizer.sv 
b/hardware/src/hdl/normalization/data_normalizer.sv similarity index 100% rename from src/hdl/normalization/data_normalizer.sv rename to hardware/src/hdl/normalization/data_normalizer.sv diff --git a/src/hdl/output/output_writer.sv b/hardware/src/hdl/output/output_writer.sv similarity index 100% rename from src/hdl/output/output_writer.sv rename to hardware/src/hdl/output/output_writer.sv diff --git a/src/hdl/output/stream_writer.sv b/hardware/src/hdl/output/stream_writer.sv similarity index 100% rename from src/hdl/output/stream_writer.sv rename to hardware/src/hdl/output/stream_writer.sv diff --git a/src/hdl/stream/data_adapters.sv b/hardware/src/hdl/stream/data_adapters.sv similarity index 100% rename from src/hdl/stream/data_adapters.sv rename to hardware/src/hdl/stream/data_adapters.sv diff --git a/src/hdl/stream/data_adapters_typed.sv b/hardware/src/hdl/stream/data_adapters_typed.sv similarity index 100% rename from src/hdl/stream/data_adapters_typed.sv rename to hardware/src/hdl/stream/data_adapters_typed.sv diff --git a/src/hdl/stream/data_demultiplexer.sv b/hardware/src/hdl/stream/data_demultiplexer.sv similarity index 100% rename from src/hdl/stream/data_demultiplexer.sv rename to hardware/src/hdl/stream/data_demultiplexer.sv diff --git a/src/hdl/stream/data_enumerator.sv b/hardware/src/hdl/stream/data_enumerator.sv similarity index 100% rename from src/hdl/stream/data_enumerator.sv rename to hardware/src/hdl/stream/data_enumerator.sv diff --git a/src/hdl/stream/data_multiplexer.sv b/hardware/src/hdl/stream/data_multiplexer.sv similarity index 100% rename from src/hdl/stream/data_multiplexer.sv rename to hardware/src/hdl/stream/data_multiplexer.sv diff --git a/src/hdl/stream/data_sink.sv b/hardware/src/hdl/stream/data_sink.sv similarity index 100% rename from src/hdl/stream/data_sink.sv rename to hardware/src/hdl/stream/data_sink.sv diff --git a/src/hdl/stream/data_splitter.sv b/hardware/src/hdl/stream/data_splitter.sv similarity index 100% rename from 
src/hdl/stream/data_splitter.sv rename to hardware/src/hdl/stream/data_splitter.sv diff --git a/src/hdl/stream/data_width_converter.sv b/hardware/src/hdl/stream/data_width_converter.sv similarity index 100% rename from src/hdl/stream/data_width_converter.sv rename to hardware/src/hdl/stream/data_width_converter.sv diff --git a/src/hdl/test/cyclic_drivers.sv b/hardware/src/hdl/test/cyclic_drivers.sv similarity index 100% rename from src/hdl/test/cyclic_drivers.sv rename to hardware/src/hdl/test/cyclic_drivers.sv diff --git a/src/hdl/util/demultiplexer.sv b/hardware/src/hdl/util/demultiplexer.sv similarity index 100% rename from src/hdl/util/demultiplexer.sv rename to hardware/src/hdl/util/demultiplexer.sv diff --git a/src/hdl/util/meta_intf_arbiter.sv b/hardware/src/hdl/util/meta_intf_arbiter.sv similarity index 100% rename from src/hdl/util/meta_intf_arbiter.sv rename to hardware/src/hdl/util/meta_intf_arbiter.sv diff --git a/src/hdl/util/performance_counter.sv b/hardware/src/hdl/util/performance_counter.sv similarity index 100% rename from src/hdl/util/performance_counter.sv rename to hardware/src/hdl/util/performance_counter.sv diff --git a/src/hdl/util/ram.sv b/hardware/src/hdl/util/ram.sv similarity index 100% rename from src/hdl/util/ram.sv rename to hardware/src/hdl/util/ram.sv diff --git a/src/hdl/util/ready_valid_helpers.sv b/hardware/src/hdl/util/ready_valid_helpers.sv similarity index 100% rename from src/hdl/util/ready_valid_helpers.sv rename to hardware/src/hdl/util/ready_valid_helpers.sv diff --git a/src/hdl/util/reset_resync.sv b/hardware/src/hdl/util/reset_resync.sv similarity index 100% rename from src/hdl/util/reset_resync.sv rename to hardware/src/hdl/util/reset_resync.sv diff --git a/src/hdl/util/shift_register.vhd b/hardware/src/hdl/util/shift_register.vhd similarity index 100% rename from src/hdl/util/shift_register.vhd rename to hardware/src/hdl/util/shift_register.vhd diff --git a/src/hdl/util/skid_buffer.sv 
b/hardware/src/hdl/util/skid_buffer.sv similarity index 100% rename from src/hdl/util/skid_buffer.sv rename to hardware/src/hdl/util/skid_buffer.sv diff --git a/src/vfpga_top.svh b/hardware/src/vfpga_top.svh similarity index 100% rename from src/vfpga_top.svh rename to hardware/src/vfpga_top.svh diff --git a/unit-tests/dict_test.py b/hardware/unit-tests/dict_test.py similarity index 100% rename from unit-tests/dict_test.py rename to hardware/unit-tests/dict_test.py diff --git a/unit-tests/libstf_utils/__init__.py b/hardware/unit-tests/libstf_utils/__init__.py similarity index 100% rename from unit-tests/libstf_utils/__init__.py rename to hardware/unit-tests/libstf_utils/__init__.py diff --git a/unit-tests/libstf_utils/common.py b/hardware/unit-tests/libstf_utils/common.py similarity index 100% rename from unit-tests/libstf_utils/common.py rename to hardware/unit-tests/libstf_utils/common.py diff --git a/unit-tests/libstf_utils/configured_test_case.py b/hardware/unit-tests/libstf_utils/configured_test_case.py similarity index 100% rename from unit-tests/libstf_utils/configured_test_case.py rename to hardware/unit-tests/libstf_utils/configured_test_case.py diff --git a/unit-tests/libstf_utils/fpga_configuration.py b/hardware/unit-tests/libstf_utils/fpga_configuration.py similarity index 100% rename from unit-tests/libstf_utils/fpga_configuration.py rename to hardware/unit-tests/libstf_utils/fpga_configuration.py diff --git a/unit-tests/libstf_utils/memory_manager.py b/hardware/unit-tests/libstf_utils/memory_manager.py similarity index 100% rename from unit-tests/libstf_utils/memory_manager.py rename to hardware/unit-tests/libstf_utils/memory_manager.py diff --git a/unit-tests/libstf_utils/output_writer_test_case.py b/hardware/unit-tests/libstf_utils/output_writer_test_case.py similarity index 100% rename from unit-tests/libstf_utils/output_writer_test_case.py rename to hardware/unit-tests/libstf_utils/output_writer_test_case.py diff --git 
a/unit-tests/normalizer_test.py b/hardware/unit-tests/normalizer_test.py
similarity index 100%
rename from unit-tests/normalizer_test.py
rename to hardware/unit-tests/normalizer_test.py
diff --git a/unit-tests/output_writer_test.py b/hardware/unit-tests/output_writer_test.py
similarity index 100%
rename from unit-tests/output_writer_test.py
rename to hardware/unit-tests/output_writer_test.py
diff --git a/unit-tests/typed_dict_test.py b/hardware/unit-tests/typed_dict_test.py
similarity index 100%
rename from unit-tests/typed_dict_test.py
rename to hardware/unit-tests/typed_dict_test.py
diff --git a/unit-tests/vfpga_tops/dict_test.sv b/hardware/unit-tests/vfpga_tops/dict_test.sv
similarity index 100%
rename from unit-tests/vfpga_tops/dict_test.sv
rename to hardware/unit-tests/vfpga_tops/dict_test.sv
diff --git a/unit-tests/vfpga_tops/normalizer_test.sv b/hardware/unit-tests/vfpga_tops/normalizer_test.sv
similarity index 100%
rename from unit-tests/vfpga_tops/normalizer_test.sv
rename to hardware/unit-tests/vfpga_tops/normalizer_test.sv
diff --git a/unit-tests/vfpga_tops/output_writer_test.sv b/hardware/unit-tests/vfpga_tops/output_writer_test.sv
similarity index 100%
rename from unit-tests/vfpga_tops/output_writer_test.sv
rename to hardware/unit-tests/vfpga_tops/output_writer_test.sv
diff --git a/unit-tests/vfpga_tops/typed_dict_test.sv b/hardware/unit-tests/vfpga_tops/typed_dict_test.sv
similarity index 100%
rename from unit-tests/vfpga_tops/typed_dict_test.sv
rename to hardware/unit-tests/vfpga_tops/typed_dict_test.sv
diff --git a/scripts/install_jemalloc.sh b/scripts/install_jemalloc.sh
new file mode 100755
index 0000000..33d827b
--- /dev/null
+++ b/scripts/install_jemalloc.sh
@@ -0,0 +1,38 @@
+#!/bin/bash -ex
+
+# Downloads, builds and installs jemalloc 5.3.0.
+#   DOWNLOAD_PATH  where the sources are unpacked (default: $HOME/download)
+#   INSTALL_PATH   install root; jemalloc ends up in $INSTALL_PATH/jemalloc (default: $HOME/opt)
+
+# Determine the paths to install into
+download_path="$HOME/download"
+if [ -n "$DOWNLOAD_PATH" ]; then
+    download_path="$DOWNLOAD_PATH"
+fi
+mkdir -p "${download_path}"
+
+install_path="$HOME/opt"
+if [ -n "$INSTALL_PATH" ]; then
+    install_path="$INSTALL_PATH"
+fi
+# Create the install directory (this used to create the download directory a second time).
+mkdir -p "${install_path}"
+
+echo "Downloading into ${download_path} and installing into ${install_path}"
+
+# Download and unpack the release archive
+wget https://github.com/jemalloc/jemalloc/archive/refs/tags/5.3.0.zip
+unzip 5.3.0.zip
+rm 5.3.0.zip
+# Remove the tree of a previous run; otherwise mv would nest the new sources inside the old
+# directory and the stale tree would be built instead.
+rm -rf "${download_path}/jemalloc"
+mv jemalloc-5.3.0 "${download_path}/jemalloc"
+pushd "${download_path}/jemalloc"
+
+# Configure the compilation
+./autogen.sh
+# prefix: Install path
+# jemalloc-prefix: prefix of all methods. This is very important as otherwise we might overwrite the default malloc!
+# narenas: Disables any default arenas (min = 1). We create our own ones in the code so we disable as many of the
+#          default arenas as we can.
+# The prefix is quoted so that install paths containing spaces keep working.
+./configure --prefix="${install_path}/jemalloc" --with-jemalloc-prefix=je_ --with-malloc-conf=narenas:1
+
+# Compile & Install
+make
+make install
+
+# Go back to the parent directory
+popd
diff --git a/sim_setup.sh b/sim_setup.sh
index f787663..0c4d1c6 100755
--- a/sim_setup.sh
+++ b/sim_setup.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
+pushd hardware
 rm -r -f build_hw
 mkdir build_hw
 pushd build_hw
diff --git a/software/CMakeLists.txt b/software/CMakeLists.txt
new file mode 100644
index 0000000..c353835
--- /dev/null
+++ b/software/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.16)
+project(libstf VERSION 0.1.0)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Collect source files
+file(GLOB_RECURSE SOURCES src/libstf/*.cpp)
+
+# Create the library
+add_library(libstf SHARED ${SOURCES})
+
+# Find dependencies
+
+# Only add Coyote as a subdirectory if we are installing it standalone
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    add_subdirectory(../coyote/sw coyote)
+else()
+    if(NOT TARGET Coyote)
+        message(FATAL_ERROR "When libstf is a subproject, parent must provide Coyote")
+    endif()
+endif()
+
+# install_jemalloc.sh installs into $HOME/opt/jemalloc by default, so search there as well.
+list(APPEND CMAKE_PREFIX_PATH "$ENV{HOME}/opt")
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+find_package(Jemalloc REQUIRED)
+
+# Specify include directories
+target_include_directories(libstf
+    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src ${COYOTE_INCLUDE_DIRS} ${JEMALLOC_INCLUDE_DIRS}
+)
+
+target_link_libraries(libstf PRIVATE Coyote ${JEMALLOC_LIBRARIES})
diff --git a/software/cmake/FindJemalloc.cmake b/software/cmake/FindJemalloc.cmake
new file mode 100644
index 0000000..3b033c0
--- /dev/null
+++ b/software/cmake/FindJemalloc.cmake
@@ -0,0 +1,34 @@
+# FindJemalloc.cmake
+# Locate Jemalloc library
+#
+# Result variables:
+#   Jemalloc_FOUND         - set by find_package_handle_standard_args
+#   JEMALLOC_LIBRARIES     - full path of the jemalloc library
+#   JEMALLOC_INCLUDE_DIRS  - directory that contains the jemalloc/ header directory
+
+# Collect the CMAKE_PREFIX_PATH entries as explicit search locations
+list(APPEND SEARCH_PATHS ${CMAKE_PREFIX_PATH})
+
+set(JEMALLOC_LIB_NAME jemalloc)
+
+# Check if Jemalloc library exists in the install prefix directories
+find_library(JEMALLOC_LIBRARIES
+    NAMES ${JEMALLOC_LIB_NAME}
+    PATHS ${SEARCH_PATHS}
+    PATH_SUFFIXES jemalloc/lib jemalloc/lib64
+)
+
+# Only look for the headers when the library itself was found. The condition used to test
+# JEMALLOC_LIB_NAME, which is a constant and therefore always true.
+if(JEMALLOC_LIBRARIES)
+    find_path(JEMALLOC_INCLUDE_DIRS
+        NAMES jemalloc
+        PATHS ${SEARCH_PATHS}
+        PATH_SUFFIXES jemalloc/include
+    )
+endif()
+
+# Provide interface for other CMake files. This is the only place that decides whether the
+# package counts as found.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIRS)
+
+# If Jemalloc is not found, clear the cache entries so a later configure run searches again
+if(NOT Jemalloc_FOUND)
+    unset(JEMALLOC_LIBRARIES CACHE)
+    unset(JEMALLOC_INCLUDE_DIRS CACHE)
+endif()
diff --git a/software/src/libstf/buffer.cpp b/software/src/libstf/buffer.cpp
new file mode 100644
index 0000000..72383da
--- /dev/null
+++ b/software/src/libstf/buffer.cpp
@@ -0,0 +1,11 @@
+#include "libstf/buffer.hpp"
+
+namespace libstf {
+
+/**
+ * Wraps an allocation in a shared Buffer descriptor.
+ *
+ * @param memory_pool The pool that ptr is handed back to once the last owner is gone. It is
+ *                    stored by reference in the deleter.
+ * @param ptr         Start address of the allocation
+ * @param size        Size stored in Buffer::size
+ * @param capacity    Capacity stored in Buffer::capacity; this is the size passed to
+ *                    MemoryPool::free by the deleter
+ * @return A shared_ptr whose BufferDeleter frees both the allocation and the descriptor
+ */
+std::shared_ptr<Buffer> make_buffer(MemoryPool &memory_pool, void *ptr, size_t size, size_t capacity) {
+    // Positional aggregate initialization: designated initializers are a C++20 feature, but the
+    // project is built as C++17. The order matches the member order of Buffer.
+    return std::shared_ptr<Buffer>(
+        new Buffer{ptr, size, capacity},
+        BufferDeleter(memory_pool));
+}
+
+} // namespace libstf
diff --git a/software/src/libstf/buffer.hpp b/software/src/libstf/buffer.hpp
new file mode 100644
index 0000000..cb05719
--- /dev/null
+++
b/software/src/libstf/buffer.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <memory>
+
+#include "libstf/memory_pool.hpp"
+
+namespace libstf {
+
+struct Buffer {
+    void *ptr;       // The address at which the buffer begins
+    size_t size;     // The actual size of the buffer
+    size_t capacity; // The total capacity of the buffer
+};
+
+// Deleter struct for allocations that is used to clean up the memory that we pass as a shared_ptr.
+// It keeps a reference to the memory pool, so the pool has to outlive every buffer created from it.
+struct BufferDeleter {
+    MemoryPool &memory_pool;
+
+    BufferDeleter(MemoryPool &memory_pool) : memory_pool(memory_pool) {}
+
+    void operator()(Buffer const *buffer) const {
+        // Nothing to release for a null descriptor
+        if (buffer == nullptr) {
+            return;
+        }
+        // First: free the allocation the struct manages
+        memory_pool.free(buffer->ptr, buffer->capacity);
+        // Then: free the struct itself!
+        delete buffer;
+    }
+};
+
+// Creates a Buffer descriptor for the given allocation; see buffer.cpp.
+std::shared_ptr<Buffer> make_buffer(MemoryPool &memory_pool, void *ptr, size_t size, size_t capacity);
+
+} // namespace libstf
diff --git a/software/src/libstf/common.cpp b/software/src/libstf/common.cpp
new file mode 100644
index 0000000..e8e23e4
--- /dev/null
+++ b/software/src/libstf/common.cpp
@@ -0,0 +1,26 @@
+#include "libstf/common.hpp"
+
+namespace libstf {
+
+// Prints the qualified enumerator name of data_type, e.g. "type_t::INT32_T".
+std::ostream &operator<<(std::ostream &out, const type_t &data_type) {
+    switch (data_type) {
+        case type_t::BYTE_T:
+            out << "type_t::BYTE_T";
+            break;
+        case type_t::INT32_T:
+            out << "type_t::INT32_T";
+            break;
+        case type_t::INT64_T:
+            out << "type_t::INT64_T";
+            break;
+        case type_t::FLOAT_T:
+            out << "type_t::FLOAT_T";
+            break;
+        case type_t::DOUBLE_T:
+            out << "type_t::DOUBLE_T";
+            break;
+        case type_t::NUM_TYPES:
+            // Not a real data type; printed so that an invalid value is visible in logs and the
+            // switch covers every enumerator.
+            out << "type_t::NUM_TYPES";
+            break;
+    }
+    return out;
+}
+
+} // namespace libstf
diff --git a/software/src/libstf/common.hpp b/software/src/libstf/common.hpp
new file mode 100644
index 0000000..d0c390f
--- /dev/null
+++ b/software/src/libstf/common.hpp
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <stdexcept>
+
+namespace libstf {
+
+// -- Utils ----------------------------------------------------------------------------------------
+
+// Problem: while std::log2 exists, it is only constexpr in gcc, not in clang.
+// Therefore, using it might not compile under all scenarios.
+// Solution: Define our own constexpr for a floored log2.
+// Inspired by: https://stackoverflow.com/a/35313613/5589776
+// Same implementation logic as in the SV code. See:
+// https://gitlab.inf.ethz.ch/OU-ALONSO/Student-Projects/fpga-dbops/-/blob/main/src/hdl/common.sv?ref_type=heads#L12
+//
+// The recursion bottoms out at val == 0 with unsigned(-1); adding 1 on the way back wraps around
+// to 0, so floor_log2(1) == 0. As a consequence floor_log2(0) is unsigned(-1) and not a valid
+// logarithm.
+constexpr unsigned floor_log2(unsigned val) {
+    return val ? 1 + floor_log2(val >> 1) : static_cast<unsigned>(-1);
+}
+
+static_assert(floor_log2(1) == 0, "floor_log2(1) must be 0");
+static_assert(floor_log2(65536) == 16, "floor_log2(2^16) must be 16");
+static_assert(floor_log2(65537) == 16, "floor_log2 must round down");
+
+// -- Constants ------------------------------------------------------------------------------------
+
+// Memory
+static constexpr uint32_t BYTES_PER_FPGA_TRANSFER = 65536;
+static constexpr uint32_t FPGA_VADDR_BITS = 48;
+static constexpr uint32_t INTERRUPT_TRANSFER_SIZE_BITS = 28;
+// Number of bits needed to express a buffer size as a count of FPGA transfers
+static constexpr uint32_t BUFFER_SIZE_BITS = INTERRUPT_TRANSFER_SIZE_BITS - floor_log2(BYTES_PER_FPGA_TRANSFER);
+// Unsigned literal so the shift is done in unsigned arithmetic
+static constexpr uint32_t MAXIMUM_FPGA_BUFFER_SIZE = (1U << INTERRUPT_TRANSFER_SIZE_BITS) - 1;
+
+// -- Type defs ------------------------------------------------------------------------------------
+typedef uint8_t stream_t; // Type that holds a stream_id
+
+enum class type_t : unsigned char {
+    BYTE_T,
+    INT32_T,
+    INT64_T,
+    FLOAT_T,
+    DOUBLE_T,
+    NUM_TYPES
+};
+
+std::ostream &operator<<(std::ostream &out, const type_t &data_type);
+
+// Size in bytes of one value of the given type.
+// Throws std::invalid_argument for type_t::NUM_TYPES and any other value without a size.
+constexpr size_t size_of(type_t type) {
+    switch (type) {
+        case type_t::BYTE_T: return 1;
+        case type_t::INT32_T: return 4;
+        case type_t::INT64_T: return 8;
+        case type_t::FLOAT_T: return 4;
+        case type_t::DOUBLE_T: return 8;
+        case type_t::NUM_TYPES: break;
+    }
+    throw std::invalid_argument("Invalid type");
+}
+
+} // namespace libstf
diff --git a/software/src/libstf/configuration.cpp b/software/src/libstf/configuration.cpp
new file mode 100644
index 0000000..bf540ac
--- /dev/null
+++ b/software/src/libstf/configuration.cpp
@@ -0,0 +1,129 @@
+#include "libstf/configuration.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+
+namespace libstf {
+
+//
----------------------------------------------------------------------------
+// ConfigRegister
+// ----------------------------------------------------------------------------
+
+ConfigRegister::ConfigRegister(uint32_t addr, uint64_t value)
+    : addr_(addr), value_(value) {
+}
+
+const uint32_t ConfigRegister::addr() const {
+    return addr_;
+}
+
+const uint64_t ConfigRegister::value() const {
+    return value_;
+}
+
+void ConfigRegister::set_value(uint64_t value) {
+    this->value_ = value;
+}
+
+std::ostream &operator<<(std::ostream &out, const ConfigRegister &conf) {
+    out << "FPGAConfiguration { .addr = " << conf.addr()
+        << " .value = 0x";
+    // Print the value as hex, then restore the formatting flags the stream had before
+    std::ios oldState(nullptr);
+    oldState.copyfmt(out);
+    out << std::hex;
+    out << conf.value();
+    out.copyfmt(oldState);
+    out << " }";
+    return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const std::vector<ConfigRegister> &conf) {
+    out << "[" << std::endl;
+    for (auto config_it = conf.begin(); config_it != conf.end(); ++config_it) {
+        out << *config_it;
+        // Separator after every element but the last. Comparing iterators avoids the mixed
+        // signed/unsigned comparison against size() - 1.
+        if (std::next(config_it) != conf.end()) {
+            out << "," << std::endl;
+        }
+    }
+    out << std::endl << "]" << std::endl;
+    return out;
+}
+
+// ----------------------------------------------------------------------------
+// Config
+// ----------------------------------------------------------------------------
+
+Config::Config(coyote::cThread &cthread, uint32_t addr_offset) :
+    cthread(cthread),
+    addr_offset(addr_offset) {}
+
+ConfigRegister Config::read_register(uint32_t addr) {
+    // The returned register carries the address relative to addr_offset, which is what
+    // write_register expects. It used to carry the absolute address, so writing a register that
+    // was obtained from read_register applied addr_offset twice.
+    return ConfigRegister(addr, cthread.getCSR(addr_offset + addr));
+}
+
+void Config::write_register(ConfigRegister reg) {
+    cthread.setCSR(reg.value(), addr_offset + reg.addr());
+}
+
+// ----------------------------------------------------------------------------
+// GlobalConfig
+// ----------------------------------------------------------------------------
+
+// Register layout as read below: register 0 holds the system id, register 1 the number of
+// configs N, registers 2 .. 2 + N - 1 hold the upper bound of each config. The first config
+// starts right behind this table, and the first register of every config holds its id.
+GlobalConfig::GlobalConfig(coyote::cThread &cthread) : Config(cthread, 0) {
+    system_id_   = read_register(0).value();
+    num_configs_ = read_register(1).value();
+
+    config_bounds.emplace_back(2 + num_configs_);
+
+    for (size_t i = 0; i < num_configs_; i++) {
+        config_bounds.emplace_back(read_register(2 + i).value());
+
+        auto config_id = read_register(config_bounds[i]).value();
+        assert(!has_config(config_id));
+        config_ids.emplace_back(config_id);
+    }
+}
+
+std::pair<uint32_t, uint32_t> GlobalConfig::get_config_bounds(uint32_t config_id) const {
+    auto it = std::find(config_ids.begin(), config_ids.end(), config_id);
+    // An assert alone is compiled out in release builds and would let the code below index past
+    // the end of config_bounds.
+    if (it == config_ids.end()) {
+        throw std::out_of_range("GlobalConfig: no config with the requested config_id");
+    }
+
+    auto config_idx = std::distance(config_ids.begin(), it);
+
+    return std::pair<uint32_t, uint32_t>(config_bounds[config_idx], config_bounds[config_idx + 1]);
+}
+
+bool GlobalConfig::has_config(uint32_t config_id) const {
+    auto it = std::find(config_ids.begin(), config_ids.end(), config_id);
+    return it != config_ids.end();
+}
+
+// ----------------------------------------------------------------------------
+// MemConfig
+// ----------------------------------------------------------------------------
+
+MemConfig::MemConfig(coyote::cThread &cthread, uint32_t addr_offset) :
+    Config(cthread, addr_offset),
+    num_streams_(read_register(1).value()) {}
+
+void MemConfig::enqueue_buffer(stream_t stream_id, Buffer &buffer) {
+    assert(stream_id < num_streams_);
+
+    auto vaddr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(buffer.ptr));
+
+    // Assert the buffer properties. The design only supports buffers that are a multiple of the
+    // transfer size.
+    assert(vaddr < (1ULL << FPGA_VADDR_BITS));
+    assert(buffer.capacity > 0);
+    assert(buffer.capacity < MAXIMUM_FPGA_BUFFER_SIZE);
+    assert(buffer.capacity % BYTES_PER_FPGA_TRANSFER == 0);
+
+    size_t capacity_as_num_transfers = buffer.capacity / BYTES_PER_FPGA_TRANSFER;
+
+    // Register value: virtual address in the upper bits, number of transfers in the lower
+    // BUFFER_SIZE_BITS bits.
+    write_register(ConfigRegister(stream_id, (vaddr << BUFFER_SIZE_BITS) | capacity_as_num_transfers));
+}
+
+} // namespace libstf
diff --git a/software/src/libstf/configuration.hpp b/software/src/libstf/configuration.hpp
new file mode 100644
index 0000000..3ad3818
--- /dev/null
+++ b/software/src/libstf/configuration.hpp
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include "coyote/cThread.hpp"
+
+#include "libstf/common.hpp"
+#include "libstf/buffer.hpp"
+
+namespace libstf {
+
+/**
+ * An address/value pair describing one configuration register.
+ */
+class ConfigRegister {
+public:
+    ConfigRegister(uint32_t addr, uint64_t value);
+
+    void set_value(uint64_t value);
+
+    const uint32_t addr() const;
+    const uint64_t value() const;
+private:
+    uint32_t addr_;
+    uint64_t value_;
+};
+
+std::ostream& operator<<(std::ostream& out, const ConfigRegister& conf);
+std::ostream& operator<<(std::ostream& out, const std::vector<ConfigRegister>& conf);
+
+/**
+ * Access to a block of configuration registers that starts at addr_offset. All register
+ * addresses passed to or returned from this class are relative to addr_offset.
+ */
+class Config {
+public:
+    Config(coyote::cThread &cthread, uint32_t addr_offset);
+
+    /**
+     * Read configuration value from addr starting at addr_offset.
+     * The returned register holds the relative address addr, so it can be modified and handed
+     * to write_register directly.
+     */
+    ConfigRegister read_register(uint32_t addr);
+
+    /**
+     * Write reg.value() to the register at reg.addr() starting at addr_offset.
+     */
+    void write_register(ConfigRegister reg);
+
+    static constexpr uint32_t ID = static_cast<uint32_t>(-1);
+
+private:
+    coyote::cThread &cthread;
+    uint32_t addr_offset;
+};
+
+class GlobalConfig : private Config {
+public:
+    /**
+     * Note: Takes the cThread as a reference so we don't create a circular dependency with
+     * CelerisContext.
+     */
+    GlobalConfig(coyote::cThread &cthread);
+
+    /**
+     * Checks whether a config with a certain config_id is present in the system. Can be used to
+     * check which operators the Celeris instance flashed to the device supports.
+     */
+    bool has_config(uint32_t config_id) const;
+
+    /**
+     * Gets the address range of a config with the given config_id.
+     * @throws std::out_of_range if no config with this id is present
+     */
+    std::pair<uint32_t, uint32_t> get_config_bounds(uint32_t config_id) const;
+
+    uint64_t system_id() const { return system_id_; }
+
+private:
+    uint64_t system_id_;
+    uint32_t num_configs_;
+
+    std::vector<uint32_t> config_ids;
+    // One more entry than config_ids: config i covers [config_bounds[i], config_bounds[i + 1])
+    std::vector<uint32_t> config_bounds;
+};
+
+class MemConfig : public Config {
+public:
+    MemConfig(coyote::cThread &cthread, uint32_t addr_offset);
+
+    /**
+     * Writes the CSR registers to add a new buffer to the FPGA for the given stream.
+     * @param stream_id The stream this buffer is enqueued for
+     * @param buffer    The buffer to write the registers for
+     */
+    void enqueue_buffer(stream_t stream_id, Buffer &buffer);
+
+    /**
+     * Writes the flush buffer CSR register which flushes potentially stale buffers in hardware.
+     */
+    void flush_buffers() { write_register(ConfigRegister(num_streams_, 0)); }
+
+    const stream_t num_streams() const { return num_streams_; }
+
+    static constexpr uint32_t ID = 0;
+
+private:
+    stream_t num_streams_;
+};
+
+class StreamConfig : public Config {
+public:
+    // Config has no default constructor; inherit its constructor so the class can be created.
+    using Config::Config;
+
+    static constexpr uint32_t ID = 1;
+};
+
+} // namespace libstf
diff --git a/software/src/libstf/error_handling.hpp b/software/src/libstf/error_handling.hpp
new file mode 100644
index 0000000..547a557
--- /dev/null
+++ b/software/src/libstf/error_handling.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+namespace libstf {
+
+enum StatusCode {
+    OK = 0,
+    OutOfMemory = 2,
+    Error = 3,
+};
+
+/**
+ * This status class was originally taken from the Maximus project.
+ */ +class Status { +public: + Status() = default; + + Status(int code, const std::string &msg) { + this->_code = code; + this->_message = msg; + } + + explicit Status(int code) { this->_code = code; } + + explicit Status(StatusCode code) { this->_code = code; } + + Status(StatusCode code, const std::string &msg) { + this->_code = code; + this->_message = msg; + } + + int code() const { return _code; } + + bool ok() const { return _code == StatusCode::OK; } + + static Status OK() { return Status(StatusCode::OK); } + + const std::string &message() const { return _message; } + + std::string to_string() const { + return "code: " + std::to_string(_code) + ", message: " + _message; + } + +private: + int _code{}; + std::string _message{}; +}; + +template +void check_status(const T &expr) { + if (!expr.ok()) { + throw std::runtime_error("Celeris Error: " + std::to_string(static_cast(expr.code())) + + "; Message: " + expr.message() + "\n" + __FILE__ + ":" + + std::to_string(__LINE__)); + } +} + +} // namespace libstf diff --git a/software/src/libstf/memory_pool.cpp b/software/src/libstf/memory_pool.cpp new file mode 100644 index 0000000..0f67208 --- /dev/null +++ b/software/src/libstf/memory_pool.cpp @@ -0,0 +1,457 @@ +#include "libstf/memory_pool.hpp" + +#include +#include + +using namespace std::chrono_literals; + +namespace libstf { + +// ---------------------------------------------------------------------------- +// Helper functions for Jemalloc +// ---------------------------------------------------------------------------- + +std::string mallctl_error_to_string(int error) { + switch (error) { + case EINVAL: + return "The alignment parameter is not a power of 2 at least as large as sizeof(void " + "*)."; + case ENOENT: + return "Name or mib specifies an unknown/invalid value."; + case EPERM: + return "Attempt to read or write void value, or attempt to write read-only value."; + case EAGAIN: + return "A memory allocation failure occurred."; + case EFAULT: + return 
"EFAULT occurred"; + default: + return "Unknown error."; + } +} + +void check_mallctl_success(int error, std::string msg) { + if (error != 0) { + throw std::runtime_error(msg + " Got error: " + mallctl_error_to_string(error)); + } +} + +/** + * Executes the given control name without parameters + * @param control_name + */ +void je_mallctl_do(std::string control_name) { + check_mallctl_success(je_mallctl(control_name.c_str(), nullptr, nullptr, nullptr, 0), + "Failed to execute mallctl control " + control_name + "."); +} + +/** + * Writes the given value to the jemalloc control with the given name + * @tparam A Type of the value to write + * @param control_name The name of the control to write to + * @param value A pointer to the value to write! + */ +template +void je_mallctl_write(std::string control_name, A *value) { + check_mallctl_success(je_mallctl(control_name.c_str(), nullptr, nullptr, value, sizeof(A)), + "Failed to write to mallctl control " + control_name + "."); +} + +/** + * @tparam A The type of value to read + * @param control_name The name of the control to read from + * @return A value of type A as returned by the given jemalloc control + */ +template +A je_mallctl_read(std::string control_name) { + A value; + auto size = sizeof(A); + check_mallctl_success(je_mallctl(control_name.c_str(), &value, &size, nullptr, 0), + "Failed to read from mallctl control " + control_name + "."); + return std::move(value); +} + +/** + * @tparam A Type of the value to be read + * @tparam B Type of the value to write + * @param control_name The control to read/write to + * @param write_value The value to write + * @return Reads & Writes from the given jemalloc control name + */ +template +A je_mallctl_read_write(std::string control_name, B *write_value) { + A read_value; + auto size = sizeof(A); + check_mallctl_success( + je_mallctl(control_name.c_str(), &read_value, &size, write_value, sizeof(B)), + "Failed to read/write from mallctl control " + control_name + "."); + 
return std::move(read_value); +} + +// ---------------------------------------------------------------------------- +// HugePageMemoryPool implementation +// ---------------------------------------------------------------------------- + +// Assign a value to the zero_size_data while making sure its aligned according +// to the default alignment. +alignas(HugePageMemoryPool::DEFAULT_ALIGNMENT) int64_t zero_size_data[1] = {0xFFFFULL}; + +HugePageMemoryPool::HugePageMemoryPool() { + // Set some default options for jemalloc + // Immediately reuse pages + je_mallctl_write("arenas.dirty_decay_ms", 0); + + // Pre-allocate all 1 Gib huge pages available in the system + auto num_huge_pages = get_number_of_available_huge_pages(); + next_free_addr = mmap(nullptr, + num_huge_pages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | HUGE_PAGE_TYPE, + -1, + 0); + if (next_free_addr == MAP_FAILED) { + throw std::runtime_error( + "Could not allocated the expected number of 1GiB huge pages for HugePageMemoryPool."); + } + initial_address_ = next_free_addr; + total_capacity_ = num_huge_pages * PAGE_SIZE; + remaining_capacity = num_huge_pages * PAGE_SIZE; + + // Create a new jemalloc arena with customs hook + // -> The hook will request chunks of the 1GiB pages we pre-allocated + auto hooks = &hugepage_hooks; + arena_index = je_mallctl_read_write("arenas.create", &hooks); +} + +HugePageMemoryPool::~HugePageMemoryPool() { + // Destroy all the tcaches we crated + // This is REQUIRED for destroying the arena and otherwise will lead to a SEGFAULT + for (auto tcache : tcache_ids) { + je_mallctl_write("tcache.destroy", &tcache.second); + } + + // It can take some time for the tcaches to be cleaned up. + // Unfortunately, the operation is not blocking. + // And there is no way to ask if the destruction has finished. + // This is really stupid. 
To prevent seg faults during termination, + // we sleep a fixed time here in the hope that after this, the tcache are destroyed. + std::this_thread::sleep_for(100ms); + + // Destroy the arena we created + std::ostringstream arena; + arena << "arena." << arena_index << ".destroy"; + je_mallctl_do(arena.str()); + + // Unmap all the pre-allocated memory + if (munmap(initial_address_, total_capacity_) == -1) { + std::cerr << "HugePageMemoryPool: Could not munmap the obtained huge page mappings." + << std::endl; + } +} + +int HugePageMemoryPool::get_number_of_available_huge_pages() { + std::ostringstream path; + path << "/sys/kernel/mm/hugepages/hugepages-" << PAGE_SIZE_KB << "kB/free_hugepages"; + std::ifstream file(path.str()); + + if (!file.is_open()) { + throw std::runtime_error( + "It seems the target system does not have 1GiB huge pages enabled, which are required " + "for FPGA support. Please enable 1GiB huge pages in your system."); + } + + // Read out the number of free pages + int free_pages = 0; + file >> free_pages; + file.close(); + if (free_pages == 0) { + throw std::runtime_error( + "Your system has 0 free 1GiB huge pages. The FPGA support requires 1GiB huge pages. " + "Please enable additional 1GiB huge pages as described in the Maximus readme."); + } + return free_pages; +} + +unsigned HugePageMemoryPool::get_tcache_id_for_calling_thread() { + // Jemalloc automatically allocates a so-called tcache for the calling + // thread of mallocx, rallocx, or dallocx calls. + // The Problem is that when we want to destroy the arena, jemalloc requires us to first, + // destroy all the (automatically created) tcaches. + // Since they are automatically managed, and we don't know how many threads called our memory + // pool, we have NO WAY of destroying them all... + // -> We need to manually manage them. 
That way, we know all created tcaches + // and can destroy them + ReadLock r_lock(tcache_lock); + auto thread_id = std::this_thread::get_id(); + auto exists = tcache_ids.find(thread_id); + if (exists != tcache_ids.end()) { + r_lock.unlock(); + return exists->second; + } + + // If we get here, the calling thread does not yet have a tcache. Create one! + r_lock.unlock(); + WriteLock w_lock(tcache_lock); + unsigned tcache_id = je_mallctl_read("tcache.create"); + tcache_ids.insert(std::make_pair(thread_id, tcache_id)); + w_lock.unlock(); + return tcache_id; +} + +bool HugePageMemoryPool::is_in_bounds(void *ptr, size_t size) { + auto address = static_cast(ptr); + auto initial = static_cast(initial_address_); + auto max_address = initial + total_capacity_; + return address >= initial && address + size - 1 <= max_address; +} + +std::pair HugePageMemoryPool::get_page_boundaries(const void *ptr) { + auto byte_ptr = static_cast(ptr); + auto initial = static_cast(initial_address_); + auto max_address = initial + total_capacity_; + if (byte_ptr < initial || byte_ptr > max_address) { + std::ostringstream err; + err << "The Provided address " << static_cast(ptr) + << " is not within the bounds of the HugePageMemoryPool"; + throw std::runtime_error(err.str()); + } + + auto n_th_page = (byte_ptr - initial) / PAGE_SIZE; + return std::make_pair(static_cast(initial + n_th_page * PAGE_SIZE), PAGE_SIZE); +} + +Status HugePageMemoryPool::allocate(size_t size, size_t alignment, void **out) { + if (size == 0) { + *out = ZeroSizePointer; + } else { + // Ensure the alignment is a power of two. 
See: https://stackoverflow.com/a/108360/5589776 + assert((alignment & (alignment - 1)) == 0); + auto tc = get_tcache_id_for_calling_thread(); + *out = je_mallocx( + size, MALLOCX_ALIGN(alignment) | MALLOCX_ARENA(arena_index) | MALLOCX_TCACHE(tc)); + if (*out == nullptr) { + return Status(StatusCode::OutOfMemory, + "HugePageMemoryPool is unable to allocate memory!"); + } + total_bytes_allocated_ += size; + bytes_allocated_ += size; + } + num_allocs_ += 1; + + return Status::OK(); +} + +Status HugePageMemoryPool::reallocate(size_t old_size, size_t new_size, size_t alignment, void **ptr) { + // We want to allocate from an existing zero allocation + if (*ptr == ZeroSizePointer) { + assert(old_size == 0); + return allocate(new_size, alignment, ptr); + } + // We want to decrease the new size to 0 + if (new_size == 0) { + free(*ptr, old_size, alignment); + *ptr = ZeroSizePointer; + return Status::OK(); + } + + // Ensure the alignment is a power of two. See: https://stackoverflow.com/a/108360/5589776 + assert((alignment & (alignment - 1)) == 0); + auto tc = get_tcache_id_for_calling_thread(); + // Normal Re-allocation with jemalloc (which cannot handle size = 0) + *ptr = je_rallocx(*ptr, new_size, + MALLOCX_ALIGN(alignment) | MALLOCX_ARENA(arena_index) | MALLOCX_TCACHE(tc)); + + if (*ptr == nullptr) { + return Status(StatusCode::OutOfMemory, + "HugePageMemoryPool could not Reallocate as requested!"); + } + + auto n_new_bytes = (new_size - old_size); + if (n_new_bytes >= 0) { + total_bytes_allocated_ += n_new_bytes; + } + bytes_allocated_ += n_new_bytes; + + return Status::OK(); +} + +void HugePageMemoryPool::free(void *ptr, size_t size, size_t alignment) { + if (ptr == ZeroSizePointer) { + assert(size == 0); + } else { + bytes_allocated_ -= size; + num_allocs_ -= 1; + auto tc = get_tcache_id_for_calling_thread(); + je_dallocx(ptr, MALLOCX_ARENA(arena_index) | MALLOCX_TCACHE(tc)); + } +} + +void *HugePageMemoryPool::huge_page_alloc(extent_hooks_t *hooks, + void *new_addr, + 
size_t size, + size_t alignment, + bool *zero, + bool *commit, + unsigned arena_ind) { + // When new_addr is != null, the man page says to return new_addr. + // -> Unclear what the intended behavior is (not documented) + // -> Ensure it is never NULL since we don't handle that case... + assert(new_addr == nullptr); + + allocations_mutex_.lock(); + + // Check if there is enough space to fit the requested size with alignment + auto aligned_address = std::align( + alignment, + size, + next_free_addr, + // Note: std::align decreases remaining_capacity automatically by the alignment bytes! + remaining_capacity); + + // There was not enough space remaining + if (aligned_address == nullptr) { + allocations_mutex_.unlock(); + std::cerr << "HugePageMemoryPool: Not enough huge page memory remaining to satisfy " + "jemalloc request over " + << size << " bytes. Please add additional 1GiB huge pages to your system." + << std::endl; + return nullptr; + } + + // There was enough space: Update values + remaining_capacity -= size; + next_free_addr = static_cast(static_cast(aligned_address) + size); + + // MAP_ANONYMOUS always ensures zero-ing of the memory and only returned commit memory + *zero = true; + *commit = true; + allocations_mutex_.unlock(); + return aligned_address; +} + +bool HugePageMemoryPool::huge_page_dealloc( + extent_hooks_t *hooks, void *addr, size_t size, bool committed, unsigned arena_ind) { + // True = opt out from deallocation and retain the memory for future use + return true; +} + +bool HugePageMemoryPool::huge_page_decommit(extent_hooks_t *hooks, + void *addr, + size_t size, + size_t offset, + size_t length, + unsigned arena_ind) { + // True = Opt out from decommit + return true; +} + +bool HugePageMemoryPool::huge_page_split_extend(extent_hooks_t *hooks, + void *addr, + size_t size, + size_t size_a, + size_t size_b, + bool committed, + unsigned arena_ind) { + // False = Pages are successfully splitted + // -> We don't really need to do anything, 
everything remains in one big chunk. + return false; +} + +bool HugePageMemoryPool::huge_page_merge_extend(extent_hooks_t *hooks, + void *addr_a, + size_t size_a, + void *addr_b, + size_t size_b, + bool committed, + unsigned arena_ind) { + // False = Successfully merged extends + // Since all extends given to jemalloc are already continuous, we can always return false + return false; +} + +// ---------------------------------------------------------------------------- +// SimpleMemoryPool implementation +// ---------------------------------------------------------------------------- + +SimpleMemoryPool::~SimpleMemoryPool() { + std::lock_guard lock(allocated_buffers_mutex); + while (!allocated_buffers.empty()) { + auto &[ptr, size] = *allocated_buffers.begin(); + free(ptr, size); + } +} + +Status SimpleMemoryPool::allocate(size_t size, size_t alignment, void **out) { + if (size == 0) { + *out = nullptr; + } else { + // Ensure the alignment is a power of two. See: https://stackoverflow.com/a/108360/5589776 + assert((alignment & (alignment - 1)) == 0); + *out = std::aligned_alloc(alignment, size); + if (!out) { + return Status(StatusCode::OutOfMemory, "Unable to allocate memory!"); + } + + total_bytes_allocated_ += size; + bytes_allocated_ += size; + } + + { + std::lock_guard lock(allocated_buffers_mutex); + allocated_buffers.emplace(*out, size); + } + num_allocs_++; + + return Status::OK(); +} + +Status SimpleMemoryPool::reallocate(size_t old_size, size_t new_size, size_t alignment, void **ptr) { + if (new_size == 0) { + free(*ptr, old_size, alignment); + *ptr = nullptr; + return Status::OK(); + } + + void* new_ptr = nullptr; + auto status = allocate(new_size, alignment, &new_ptr); + if (!status.ok()) { + return status; + } + + if (*ptr) { + std::memcpy(new_ptr, *ptr, std::min(old_size, new_size)); + free(*ptr, old_size, alignment); + } + + auto n_new_bytes = (new_size - old_size); + if (n_new_bytes >= 0) { + total_bytes_allocated_ += n_new_bytes; + } + 
bytes_allocated_ += n_new_bytes; + + *ptr = new_ptr; + return Status::OK(); +} + +void SimpleMemoryPool::free(void *ptr, size_t size, size_t alignment) { + if (ptr) { + bytes_allocated_ -= size; + num_allocs_ -= 1; + std::free(ptr); + + std::lock_guard lock(allocated_buffers_mutex); + allocated_buffers.erase(ptr); + } +} + +std::pair SimpleMemoryPool::get_page_boundaries(const void *ptr) { + std::lock_guard lock(allocated_buffers_mutex); + if (allocated_buffers.find((void *) ptr) == allocated_buffers.end()) { + std::ostringstream err; + err << "The Provided address " << ptr << " is not within the bounds of the memory pool!"; + throw std::runtime_error(err.str()); + } + + return std::make_pair((void *) ptr, allocated_buffers[(void *) ptr]); +} + +} // namespace libstf diff --git a/software/src/libstf/memory_pool.hpp b/software/src/libstf/memory_pool.hpp new file mode 100644 index 0000000..e1e67c1 --- /dev/null +++ b/software/src/libstf/memory_pool.hpp @@ -0,0 +1,281 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "jemalloc/jemalloc.h" + +#include "libstf/error_handling.hpp" + +typedef std::shared_mutex ReaderWriterLock; +typedef std::unique_lock WriteLock; +typedef std::shared_lock ReadLock; + +namespace libstf { + +/** + * Base class for memory pools in libstf. + */ +class MemoryPool { +public: + virtual ~MemoryPool() = default; + + /** + * Allocates a memory block of at least the specified size. + * + * @param size The minimum number of bytes to allocate. + * @param alignment The required alignment for the memory block. + * @param out Pointer to store the address of the allocated memory. + * @return Status indicating success or failure. + */ + virtual Status allocate(size_t size, size_t alignment, void **out) = 0; + virtual Status allocate(size_t size, void **out) = 0; + + /** + * Resizes an existing allocated memory block. 
+ * + * Since most platform allocators do not support aligned reallocation, + * this operation may involve copying the data to a new memory block. + * @param old_size The current size of the allocated memory block. + * @param new_size The desired new size of the memory block. + * @param alignment The alignment requirement of the memory block. + * @param ptr Pointer to the memory block to be resized. Updated on success. + * @return Status indicating success or failure. + */ + virtual Status reallocate(size_t old_size, size_t new_size, size_t alignment, void **ptr) = 0; + + /** + * Frees a previously allocated memory block. + * + * @param ptr Pointer to the start of the allocated memory block. + * @param size The size of the allocated memory block. + * Some allocators may use this for tracking memory usage or + * optimizing deallocation. + * @param alignment The alignment of the memory block. + */ + virtual void free(void *ptr, size_t size, size_t alignment) = 0; + virtual void free(void *ptr, size_t size) = 0; + + /** + * Returns the address and size for the page in which the given allocation was placed. + * This information can be used, e.g. for TLB mappings on FPGAs. + * @param ptr The address where the allocation begins, as returned by 'allocate' + * @return A pair of: Start address and size of the allocated page + */ + virtual std::pair get_page_boundaries(const void *ptr) = 0; + + /** + * Retrieves the current amount of allocated memory that has not been freed. + * + * @return The number of bytes currently allocated. + */ + virtual size_t bytes_allocated() const = 0; + + /** + * Retrieves the total amount of memory allocated since the pool's creation. + * + * @return The cumulative number of bytes allocated. + */ + virtual size_t total_bytes_allocated() const = 0; + + /** + * Retrieves the total number of allocation and reallocation requests. + * + * @return The number of times memory has been allocated or reallocated. 
+ */ + virtual size_t num_allocations() const = 0; + + /** + * Retrieves the peak memory usage recorded by this memory pool. + * + * @return The highest number of bytes allocated at any point. + * Returns -1 if tracking is not implemented. + */ + virtual size_t max_memory() const = 0; + + /** + * Retrieves the name of the memory allocation backend in use. + * + * @return A string representing the backend (e.g., "system", "jemalloc"). + */ + virtual std::string backend_name() const = 0; +}; + +// A static piece of memory for 0-size allocations, to return an aligned non-null pointer. This is +// required because Arrow memory pools (when we use this in other projects) need to support 0-byte +// allocations, reallocations, and deallocations but jemalloc does not support them! +extern int64_t zero_size_data[1]; +static void *const ZeroSizePointer = reinterpret_cast(&zero_size_data); + +/** + * This class implements a MemoryPool that uses 1GiB huge pages. + * This is required for the FPGA support since all the data sent to/received from the FPGA + * needs to be mapped on the FPGA's TLB. Additionally, every TLB miss causes an FPGA-side interrupt + * and is handled in Coyote's kernel code. As small pages lead to many TLB misses, this can cause + * performance problems due to the large number of interrupts. The goal of this pool is to minimize + * such misses by using huge pages. + * + * The pool is implemented as follows: During initialization, it pre-allocates all available 1GiB + * pages in the system. Under the hood it uses jemalloc to handle the actual + * allocation/free requests. Jemalloc uses smart (and very complex) mechanisms to minimize + * fragmentation. However, whenever it runs out of memory, it asks this pool via extent hooks, + * which provides a chunk of the pre-allocated memory. The pre-allocated 1GiB pages will be + * unmapped/freed when the MemoryPool is destroyed. 
+ */ +class HugePageMemoryPool : public MemoryPool { +public: + // The size of the pages to use. These are 1GiB huge pages by default + static inline const size_t HUGE_PAGE_BITS = 30; + static inline const size_t PAGE_SIZE = 1 << HUGE_PAGE_BITS; + static inline const size_t PAGE_SIZE_KB = PAGE_SIZE / 1024; + static inline const size_t HUGE_PAGE_TYPE = (HUGE_PAGE_BITS << MAP_HUGE_SHIFT); + + // Note: This alignment is required by Coyote anyway + static inline const size_t DEFAULT_ALIGNMENT = 64; + + HugePageMemoryPool(); + ~HugePageMemoryPool(); + + Status allocate(size_t raw_size, size_t alignment, void **out) override; + Status allocate(size_t size, void **out) override { return allocate(size, DEFAULT_ALIGNMENT, out); } + + Status reallocate(size_t old_size, size_t new_size, size_t alignment, void **ptr) override; + + void free(void *ptr, size_t size, size_t alignment) override; + void free(void *ptr, size_t size) override { free(ptr, size, DEFAULT_ALIGNMENT); } + + std::pair get_page_boundaries(const void *ptr) override; + + /** + * @param ptr The start address of the buffer to check + * @param size The size of the buffer to check + * @return Whether the given buffer and size have been allocated by this memory pool + */ + bool is_in_bounds(void *ptr, size_t size); + + size_t bytes_allocated() const override { return bytes_allocated_.load(); } + size_t total_bytes_allocated() const override { return total_bytes_allocated_.load(); } + size_t num_allocations() const override { return num_allocs_.load(); } + size_t max_memory() const override { return std::numeric_limits::max(); } + std::string backend_name() const override { return "HugePageMemoryPool"; } + + void *initial_address() const { return initial_address_; } + size_t total_capacity() const { return total_capacity_; } + +private: + /** + * @return The number of huge pages with PAGE_SIZE currently available in the system. 
+ * Ensures this number is > 0 + */ + int get_number_of_available_huge_pages(); + + // Callback functions that get called by jemalloc to manage the underlying memory + static void* huge_page_alloc(extent_hooks_t *hooks, + void *new_addr, + size_t size, + size_t alignment, + bool *zero, + bool *commit, + unsigned arena_ind); + + static bool huge_page_dealloc( + extent_hooks_t *hooks, void *addr, size_t size, bool committed, unsigned arena_ind); + + static bool huge_page_decommit( + extent_hooks_t *hooks, void *addr, size_t size, size_t offset, size_t length, unsigned arena_ind); + + static bool huge_page_split_extend( + extent_hooks_t *hooks, void* addr, size_t size, size_t size_a, size_t size_b, bool committed, unsigned arena_ind); + + static bool huge_page_merge_extend( + extent_hooks_t *hooks, void *addr_a, size_t size_a, void *addr_b, size_t size_b, bool committed, unsigned arena_ind); + + // Explicit management of thread caches (or tcaches) + unsigned get_tcache_id_for_calling_thread(); + ReaderWriterLock tcache_lock; + std::unordered_map tcache_ids; + + // The index of the arena we allocate + unsigned arena_index; + // The struct of hooks we use for the allocation + // Needs to be static to ensure it lives long enough + static inline extent_hooks_t hugepage_hooks = { + huge_page_alloc, // alloc + huge_page_dealloc, // dalloc + nullptr, // destroy + nullptr, // commit + huge_page_decommit, // decommit + nullptr, // purge_lazy + nullptr, // purge_forced + huge_page_split_extend, // split + huge_page_merge_extend // merge + }; + + // The initial address of the memory we allocate + // -> Needed for the de-allocation + void *initial_address_; + // Total capacity of allocated memory + size_t total_capacity_; + // Atomics for the statistics + std::atomic total_bytes_allocated_{0}; + std::atomic bytes_allocated_{0}; + std::atomic num_allocs_{0}; + // How much capacity is remaining in the allocated huge page memory + static inline size_t remaining_capacity = 0; + // 
The next free address that can be returned to jemalloc + static inline void *next_free_addr = nullptr; + static inline std::mutex allocations_mutex_; +}; + +/** + * Implements a naive memory pool that is only used for simulation purposes in systems where there + * are no huge pages. + */ +class SimpleMemoryPool : public MemoryPool { +public: + static inline const size_t PAGE_SIZE = 4096; + + // Note: This alignment is required by Coyote anyway + static inline const size_t DEFAULT_ALIGNMENT = 64; + + SimpleMemoryPool() = default; + ~SimpleMemoryPool() override; + + Status allocate(size_t size, size_t alignment, void **out) override; + Status allocate(size_t size, void **out) override { return allocate(size, DEFAULT_ALIGNMENT, out); }; + + Status reallocate(size_t old_size, size_t new_size, size_t alignment, void **ptr) override; + + void free(void *ptr, size_t size, size_t alignment) override; + void free(void *ptr, size_t size) override { free(ptr, size, DEFAULT_ALIGNMENT); }; + + std::pair get_page_boundaries(const void *ptr) override; + + size_t bytes_allocated() const override { return bytes_allocated_.load(); }; + size_t total_bytes_allocated() const override { return total_bytes_allocated_.load(); }; + size_t num_allocations() const override { return num_allocs_.load(); }; + size_t max_memory() const override { return std::numeric_limits::max(); }; + std::string backend_name() const override { return "SimpleMemoryPool"; }; + +private: + // A map of all the pages that have been allocated and mapped for this thread + std::unordered_map allocated_buffers; + // Recursive mutex to protect allocated_buffers (recursive to allow destructor to call free()) + mutable std::recursive_mutex allocated_buffers_mutex; + + // Atomics for the statistics + std::atomic total_bytes_allocated_{0}; + std::atomic bytes_allocated_{0}; + std::atomic num_allocs_{0}; +}; + +} // namespace libstf diff --git a/software/src/libstf/tlb_manager.cpp b/software/src/libstf/tlb_manager.cpp new 
file mode 100644 index 0000000..69c51ef --- /dev/null +++ b/software/src/libstf/tlb_manager.cpp @@ -0,0 +1,44 @@ +#include "libstf/tlb_manager.hpp" + +namespace libstf { + +// ---------------------------------------------------------------------------- +// Public methods +// ---------------------------------------------------------------------------- + +TLBManager::TLBManager(coyote::cThread &cthread, MemoryPool &memory_pool) + : memory_pool(memory_pool) + , cthread(cthread) {} + +TLBManager::~TLBManager() { + std::lock_guard tlb_guard(tlb_mutex); + + // Unmap all tlb entries we have created on the FPGA + for (const auto& mapping : existing_tlb_mappings) { + cthread.userUnmap(mapping); + } +} + +void TLBManager::ensure_tlb_mapping(const void *data_address, size_t size) { + // Guard the tlb_mapping structures. Note: This uses a recursive mutex! + std::lock_guard guard(tlb_mutex); + + auto [page_address, page_size] = + memory_pool.get_page_boundaries(data_address); + // Check if this mapping already exists + if (existing_tlb_mappings.find(page_address) == existing_tlb_mappings.end()) { + // Add new TLB entry for this page! + cthread.userMap(page_address, page_size); + existing_tlb_mappings.insert(page_address); + } + + auto end_mapped = static_cast(data_address) + size; + auto page_end = static_cast(page_address) + page_size; + + // Check if the size goes over one page and further pages need to be mapped + if (end_mapped > page_end) { + ensure_tlb_mapping(static_cast(page_end), end_mapped - page_end); + } +} + +} // namespace libstf diff --git a/software/src/libstf/tlb_manager.hpp b/software/src/libstf/tlb_manager.hpp new file mode 100644 index 0000000..257df85 --- /dev/null +++ b/software/src/libstf/tlb_manager.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include +#include + +#include "coyote/cThread.hpp" + +#include "libstf/memory_pool.hpp" + +namespace libstf { + +/** + * The TLB manager is responsible for ensuring that allocated pages are mapped to the FPGA's TLB. 
It + * also unmaps the pages upon destruction. + * + * Note: This does not work when the TLB becomes full and TLB entries are evicted. It is intended to + * be used with 1GiB huge pages. With the standard Coyote TLB configuration, this allows mapping + * 512GiB of memory. + */ +class TLBManager { +public: + TLBManager(coyote::cThread &cthread, MemoryPool &memory_pool); + + ~TLBManager(); + + /** + * Ensures a TLB mapping on the FPGA exists for the given address and size. If the address was + * already mapped previously, no new mapping will be created. It is assumed that the given + * address was allocated using a memory pool based on MemoryPool. + * + * Note: We could also just always call cThread::userMap(...) but that invokes a system call + * which we want to avoid for performance reasons. + * + * @param data_address Address that points to the beginning of the data + * @param size The size of the data to be mapped in bytes. + */ + void ensure_tlb_mapping(const void *data_address, size_t size); + +private: + coyote::cThread &cthread; + MemoryPool &memory_pool; + + // The address of all the pages for which we already performed a TLB mapping + std::set existing_tlb_mappings; + std::recursive_mutex tlb_mutex; +}; + +} // namespace libstf From ae57b63648fdba2451d1f0f72e57db0c88fe80cc Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 00:27:18 +0100 Subject: [PATCH 02/18] reorder files, make lib installable --- software/CMakeLists.txt | 36 ++++++++++++++++++-- software/{src => }/libstf/buffer.cpp | 0 software/{src => }/libstf/buffer.hpp | 0 software/{src => }/libstf/common.cpp | 0 software/{src => }/libstf/common.hpp | 0 software/{src => }/libstf/configuration.cpp | 0 software/{src => }/libstf/configuration.hpp | 0 software/{src => }/libstf/error_handling.hpp | 0 software/{src => }/libstf/memory_pool.cpp | 0 software/{src => }/libstf/memory_pool.hpp | 0 software/{src => }/libstf/tlb_manager.cpp | 0 software/{src => }/libstf/tlb_manager.hpp | 0 12 files changed, 
33 insertions(+), 3 deletions(-) rename software/{src => }/libstf/buffer.cpp (100%) rename software/{src => }/libstf/buffer.hpp (100%) rename software/{src => }/libstf/common.cpp (100%) rename software/{src => }/libstf/common.hpp (100%) rename software/{src => }/libstf/configuration.cpp (100%) rename software/{src => }/libstf/configuration.hpp (100%) rename software/{src => }/libstf/error_handling.hpp (100%) rename software/{src => }/libstf/memory_pool.cpp (100%) rename software/{src => }/libstf/memory_pool.hpp (100%) rename software/{src => }/libstf/tlb_manager.cpp (100%) rename software/{src => }/libstf/tlb_manager.hpp (100%) diff --git a/software/CMakeLists.txt b/software/CMakeLists.txt index c353835..3ac2a13 100644 --- a/software/CMakeLists.txt +++ b/software/CMakeLists.txt @@ -5,7 +5,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Collect source files -file(GLOB_RECURSE SOURCES src/libstf/*.cpp) +file(GLOB_RECURSE SOURCES libstf/*.cpp) # Create the library add_library(libstf SHARED ${SOURCES}) @@ -21,13 +21,43 @@ else() endif() endif() -list(APPEND CMAKE_PREFIX_PATH "$ENV{HOME}/opt") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") find_package(Jemalloc REQUIRED) # Specify include directories target_include_directories(libstf - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src ${COYOTE_INCLUDE_DIRS} ${JEMALLOC_INCLUDE_DIRS} + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${COYOTE_INCLUDE_DIRS} ${JEMALLOC_INCLUDE_DIRS} ) target_link_libraries(libstf PRIVATE Coyote ${JEMALLOC_LIBRARIES}) + +include(GNUInstallDirs) + +# Install the library +install(TARGETS libstf + EXPORT libstfTargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + +# Install headers (install(DIRECTORY) appends the directory name "libstf" to the destination) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/libstf + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN "*.hpp" +) + +# Export 
package configuration +install(EXPORT libstfTargets + FILE libstfTargets.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libstf +) + +# Generate CMake package configuration files +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/libstfConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) diff --git a/software/src/libstf/buffer.cpp b/software/libstf/buffer.cpp similarity index 100% rename from software/src/libstf/buffer.cpp rename to software/libstf/buffer.cpp diff --git a/software/src/libstf/buffer.hpp b/software/libstf/buffer.hpp similarity index 100% rename from software/src/libstf/buffer.hpp rename to software/libstf/buffer.hpp diff --git a/software/src/libstf/common.cpp b/software/libstf/common.cpp similarity index 100% rename from software/src/libstf/common.cpp rename to software/libstf/common.cpp diff --git a/software/src/libstf/common.hpp b/software/libstf/common.hpp similarity index 100% rename from software/src/libstf/common.hpp rename to software/libstf/common.hpp diff --git a/software/src/libstf/configuration.cpp b/software/libstf/configuration.cpp similarity index 100% rename from software/src/libstf/configuration.cpp rename to software/libstf/configuration.cpp diff --git a/software/src/libstf/configuration.hpp b/software/libstf/configuration.hpp similarity index 100% rename from software/src/libstf/configuration.hpp rename to software/libstf/configuration.hpp diff --git a/software/src/libstf/error_handling.hpp b/software/libstf/error_handling.hpp similarity index 100% rename from software/src/libstf/error_handling.hpp rename to software/libstf/error_handling.hpp diff --git a/software/src/libstf/memory_pool.cpp b/software/libstf/memory_pool.cpp similarity index 100% rename from software/src/libstf/memory_pool.cpp rename to software/libstf/memory_pool.cpp diff --git a/software/src/libstf/memory_pool.hpp b/software/libstf/memory_pool.hpp similarity index 100% rename from 
software/src/libstf/memory_pool.hpp rename to software/libstf/memory_pool.hpp diff --git a/software/src/libstf/tlb_manager.cpp b/software/libstf/tlb_manager.cpp similarity index 100% rename from software/src/libstf/tlb_manager.cpp rename to software/libstf/tlb_manager.cpp diff --git a/software/src/libstf/tlb_manager.hpp b/software/libstf/tlb_manager.hpp similarity index 100% rename from software/src/libstf/tlb_manager.hpp rename to software/libstf/tlb_manager.hpp From f9b6fd95bde3ca29e5193d93659c78abad886063 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 00:37:00 +0100 Subject: [PATCH 03/18] use std::shared_ptr in buffer --- software/CMakeLists.txt | 7 ++++++- software/libstf/buffer.cpp | 2 +- software/libstf/buffer.hpp | 8 ++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/software/CMakeLists.txt b/software/CMakeLists.txt index 3ac2a13..0a0f863 100644 --- a/software/CMakeLists.txt +++ b/software/CMakeLists.txt @@ -26,7 +26,12 @@ find_package(Jemalloc REQUIRED) # Specify include directories target_include_directories(libstf - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${COYOTE_INCLUDE_DIRS} ${JEMALLOC_INCLUDE_DIRS} + PUBLIC + $ + $ +) +target_include_directories(libstf + PUBLIC ${COYOTE_INCLUDE_DIRS} ${JEMALLOC_INCLUDE_DIRS} ) target_link_libraries(libstf PRIVATE Coyote ${JEMALLOC_LIBRARIES}) diff --git a/software/libstf/buffer.cpp b/software/libstf/buffer.cpp index 72383da..84cf0b3 100644 --- a/software/libstf/buffer.cpp +++ b/software/libstf/buffer.cpp @@ -2,7 +2,7 @@ namespace libstf { -std::shared_ptr make_buffer(MemoryPool &memory_pool, void *ptr, size_t size, size_t capacity) { +std::shared_ptr make_buffer(std::shared_ptr memory_pool, void *ptr, size_t size, size_t capacity) { return std::shared_ptr( new Buffer{.ptr = ptr, .size = size, .capacity = capacity}, BufferDeleter(memory_pool)); diff --git a/software/libstf/buffer.hpp b/software/libstf/buffer.hpp index cb05719..50a9f05 100644 --- a/software/libstf/buffer.hpp +++ 
b/software/libstf/buffer.hpp @@ -14,18 +14,18 @@ struct Buffer { // Deleter struct for allocations that is used to clean up the memory that we pass as a shared_ptr. struct BufferDeleter { - MemoryPool &memory_pool; + std::shared_ptr memory_pool; - BufferDeleter(MemoryPool &memory_pool) : memory_pool(memory_pool) {} + BufferDeleter(std::shared_ptr memory_pool) : memory_pool(memory_pool) {} void operator()(Buffer const *buffer) const { // First: free the allocation the struct manages - memory_pool.free(buffer->ptr, buffer->capacity); + memory_pool->free(buffer->ptr, buffer->capacity); // Then: free the struct itself! delete buffer; } }; -std::shared_ptr make_buffer(MemoryPool &memory_pool, void *ptr, size_t size, size_t capacity); +std::shared_ptr make_buffer(std::shared_ptr memory_pool, void *ptr, size_t size, size_t capacity); } // namespace libstf From 92900e283065dfc6602512eb7fd257dda85fdc44 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 00:38:45 +0100 Subject: [PATCH 04/18] use shared_ptr in tlb manager aswell --- software/libstf/tlb_manager.cpp | 8 ++++---- software/libstf/tlb_manager.hpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/software/libstf/tlb_manager.cpp b/software/libstf/tlb_manager.cpp index 69c51ef..511e1b0 100644 --- a/software/libstf/tlb_manager.cpp +++ b/software/libstf/tlb_manager.cpp @@ -6,7 +6,7 @@ namespace libstf { // Public methods // ---------------------------------------------------------------------------- -TLBManager::TLBManager(coyote::cThread &cthread, MemoryPool &memory_pool) +TLBManager::TLBManager(std::shared_ptr cthread, std::shared_ptr memory_pool) : memory_pool(memory_pool) , cthread(cthread) {} @@ -15,7 +15,7 @@ TLBManager::~TLBManager() { // Unmap all tlb entries we have created on the FPGA for (const auto& mapping : existing_tlb_mappings) { - cthread.userUnmap(mapping); + cthread->userUnmap(mapping); } } @@ -24,11 +24,11 @@ void TLBManager::ensure_tlb_mapping(const void *data_address, 
size_t size) { std::lock_guard guard(tlb_mutex); auto [page_address, page_size] = - memory_pool.get_page_boundaries(data_address); + memory_pool->get_page_boundaries(data_address); // Check if this mapping already exists if (existing_tlb_mappings.find(page_address) == existing_tlb_mappings.end()) { // Add new TLB entry for this page! - cthread.userMap(page_address, page_size); + cthread->userMap(page_address, page_size); existing_tlb_mappings.insert(page_address); } diff --git a/software/libstf/tlb_manager.hpp b/software/libstf/tlb_manager.hpp index 257df85..a026341 100644 --- a/software/libstf/tlb_manager.hpp +++ b/software/libstf/tlb_manager.hpp @@ -19,7 +19,7 @@ namespace libstf { */ class TLBManager { public: - TLBManager(coyote::cThread &cthread, MemoryPool &memory_pool); + TLBManager(std::shared_ptr cthread, std::shared_ptr memory_pool); ~TLBManager(); @@ -37,8 +37,8 @@ class TLBManager { void ensure_tlb_mapping(const void *data_address, size_t size); private: - coyote::cThread &cthread; - MemoryPool &memory_pool; + std::shared_ptr cthread; + std::shared_ptr memory_pool; // The address of all the pages for which we already performed a TLB mapping std::set existing_tlb_mappings; From 38c99588a913df0be7222903bbaa41e63d044fd1 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 00:42:57 +0100 Subject: [PATCH 05/18] put buffer implementation in c file --- software/libstf/buffer.cpp | 9 +++++++++ software/libstf/buffer.hpp | 12 ++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/software/libstf/buffer.cpp b/software/libstf/buffer.cpp index 84cf0b3..1a4f455 100644 --- a/software/libstf/buffer.cpp +++ b/software/libstf/buffer.cpp @@ -2,6 +2,15 @@ namespace libstf { +BufferDeleter::BufferDeleter(std::shared_ptr memory_pool) : memory_pool(memory_pool) {} + +void BufferDeleter::operator()(Buffer const *buffer) const { + // First: free the allocation the struct manages + memory_pool->free(buffer->ptr, buffer->capacity); + // Then: free the 
struct itself! + delete buffer; +} + std::shared_ptr make_buffer(std::shared_ptr memory_pool, void *ptr, size_t size, size_t capacity) { return std::shared_ptr( new Buffer{.ptr = ptr, .size = size, .capacity = capacity}, diff --git a/software/libstf/buffer.hpp b/software/libstf/buffer.hpp index 50a9f05..c9113a5 100644 --- a/software/libstf/buffer.hpp +++ b/software/libstf/buffer.hpp @@ -14,16 +14,12 @@ struct Buffer { // Deleter struct for allocations that is used to clean up the memory that we pass as a shared_ptr. struct BufferDeleter { - std::shared_ptr memory_pool; + BufferDeleter(std::shared_ptr memory_pool); - BufferDeleter(std::shared_ptr memory_pool) : memory_pool(memory_pool) {} + void operator()(Buffer const *buffer) const; - void operator()(Buffer const *buffer) const { - // First: free the allocation the struct manages - memory_pool->free(buffer->ptr, buffer->capacity); - // Then: free the struct itself! - delete buffer; - } +private: + std::shared_ptr memory_pool; }; std::shared_ptr make_buffer(std::shared_ptr memory_pool, void *ptr, size_t size, size_t capacity); From c82dd266b90e05fd85b29157c19ac4d0c5caa2c3 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 00:58:58 +0100 Subject: [PATCH 06/18] document usage of -DCMAKE_PREFIX_PATH in readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cb0b3e2..f86909b 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,13 @@ This will install jemalloc in `~/opt/jemalloc`. The you can build the libstf lib ```bash mkdir build && cd build -cmake .. +cmake -DCMAKE_PREFIX_PATH=$HOME/opt .. make ``` +Notice the `-DCMAKE_PREFIX_PATH=$HOME/opt`: this is so that CMake will look in the path where +jemalloc has been installed with the script and link libstf against it. 
+ ## Code Style For now, we have a couple of code style rules: From 9192c6d746a7dd693d3f47fd56ae1999a4b582c8 Mon Sep 17 00:00:00 2001 From: Luca Date: Mon, 12 Jan 2026 11:17:25 +0100 Subject: [PATCH 07/18] use std::shared_ptr in configuration --- software/libstf/configuration.cpp | 10 +++++----- software/libstf/configuration.hpp | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/software/libstf/configuration.cpp b/software/libstf/configuration.cpp index bf540ac..380a65b 100644 --- a/software/libstf/configuration.cpp +++ b/software/libstf/configuration.cpp @@ -56,23 +56,23 @@ std::ostream &operator<<(std::ostream &out, const std::vector &c // Config // ---------------------------------------------------------------------------- -Config::Config(coyote::cThread &cthread, uint32_t addr_offset) : +Config::Config(std::shared_ptr cthread, uint32_t addr_offset) : cthread(cthread), addr_offset(addr_offset) {} ConfigRegister Config::read_register(uint32_t addr) { - return ConfigRegister(addr_offset + addr, cthread.getCSR(addr_offset + addr)); + return ConfigRegister(addr_offset + addr, cthread->getCSR(addr_offset + addr)); } void Config::write_register(ConfigRegister reg) { - cthread.setCSR(reg.value(), addr_offset + reg.addr()); + cthread->setCSR(reg.value(), addr_offset + reg.addr()); } // ---------------------------------------------------------------------------- // GlobalConfig // ---------------------------------------------------------------------------- -GlobalConfig::GlobalConfig(coyote::cThread &cthread) : Config(cthread, 0) { +GlobalConfig::GlobalConfig(std::shared_ptr cthread) : Config(cthread, 0) { system_id_ = read_register(0).value(); num_configs_ = read_register(1).value(); @@ -105,7 +105,7 @@ bool GlobalConfig::has_config(uint32_t config_id) { // MemConfig // ---------------------------------------------------------------------------- -MemConfig::MemConfig(coyote::cThread &cthread, uint32_t addr_offset) : 
+MemConfig::MemConfig(std::shared_ptr cthread, uint32_t addr_offset) : Config(cthread, addr_offset), num_streams_(read_register(1).value()) {} diff --git a/software/libstf/configuration.hpp b/software/libstf/configuration.hpp index 3ad3818..97c7800 100644 --- a/software/libstf/configuration.hpp +++ b/software/libstf/configuration.hpp @@ -31,7 +31,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& c class Config { public: - Config(coyote::cThread &cthread, uint32_t addr_offset); + Config(std::shared_ptr cthread, uint32_t addr_offset); /** * Read configuration value from addr starting at addr_offset. @@ -43,7 +43,7 @@ class Config { static constexpr uint32_t ID = -1; private: - coyote::cThread &cthread; + std::shared_ptr cthread; uint32_t addr_offset; }; @@ -53,7 +53,7 @@ class GlobalConfig : private Config { * Note: Takes the cThread as a reference so we don't create a circular dependency with * CelerisContext. */ - GlobalConfig(coyote::cThread &cthread); + GlobalConfig(std::shared_ptr cthread); /** * Checks whether a config with a certain config_id is present in the system. Can be used to @@ -78,7 +78,7 @@ class GlobalConfig : private Config { class MemConfig : public Config { public: - MemConfig(coyote::cThread &cthread, uint32_t addr_offset); + MemConfig(std::shared_ptr cthread, uint32_t addr_offset); /** * Writes the CSR registers to add a new buffer to the FPGA for the given stream. 
From 1d4ee10aace54c2a9ee7a3e4f7d20cee4550f976 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 13 Jan 2026 01:28:24 +0100 Subject: [PATCH 08/18] Tie off tasks --- hardware/src/hdl/config/stream_config.sv | 1 - hardware/src/hdl/data_interfaces.sv | 97 +++++++++++++++++++++--- 2 files changed, 85 insertions(+), 13 deletions(-) diff --git a/hardware/src/hdl/config/stream_config.sv b/hardware/src/hdl/config/stream_config.sv index 53a64ce..95350cd 100644 --- a/hardware/src/hdl/config/stream_config.sv +++ b/hardware/src/hdl/config/stream_config.sv @@ -4,7 +4,6 @@ `include "config_macros.svh" module StreamConfig #( - parameter NUM_SELECT, parameter NUM_STREAMS ) ( input logic clk, diff --git a/hardware/src/hdl/data_interfaces.sv b/hardware/src/hdl/data_interfaces.sv index 6591051..23bca98 100644 --- a/hardware/src/hdl/data_interfaces.sv +++ b/hardware/src/hdl/data_interfaces.sv @@ -6,17 +6,19 @@ interface valid_i #( data_t data; logic valid; + task tie_off_m(); // Tie off unused slave signals + data = '0; + valid = 1'b0; + endtask + modport m ( + import tie_off_m, output data, valid ); modport s ( input data, valid ); - - task tie_off_s(); // Tie off unused slave signals - valid = 1'b0; - endtask endinterface interface ready_valid_i #( @@ -26,23 +28,26 @@ interface ready_valid_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + valid = 1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, valid ); modport s ( + import tie_off_s, input data, valid, output ready ); - - task tie_off_m(); // Tie off unused master signals - ready = 1'b0; - endtask - - task tie_off_s(); // Tie off unused slave signals - valid = 1'b0; - endtask endinterface interface data_i #( @@ -54,12 +59,25 @@ interface data_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + keep = 1'b0; + last = 1'b0; + valid = 
1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, keep, last, valid ); modport s ( + import tie_off_s, input data, keep, last, valid, output ready ); @@ -75,12 +93,25 @@ interface ndata_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + keep = '0; + last = 1'b0; + valid = 1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, keep, last, valid ); modport s ( + import tie_off_s, input data, keep, last, valid, output ready ); @@ -97,12 +128,26 @@ interface tagged_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + tag = '0; + keep = 1'b0; + last = 1'b0; + valid = 1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, tag, keep, last, valid ); modport s ( + import tie_off_s, input data, tag, keep, last, valid, output ready ); @@ -122,12 +167,26 @@ interface ntagged_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + tag = '0; + keep = '0; + last = 1'b0; + valid = 1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, tag, keep, last, valid ); modport s ( + import tie_off_s, input data, tag, keep, last, valid, output ready ); @@ -143,12 +202,26 @@ interface typed_ndata_i #( logic valid; logic ready; + task tie_off_m(); // Tie off unused slave signals + data = '0; + typ = '0; + keep = '0; + last = 1'b0; + valid = 1'b0; + endtask + + task tie_off_s(); // Tie off unused master signals + ready = 1'b0; + endtask + modport m ( + import tie_off_m, input ready, output data, typ, keep, last, valid ); modport s ( + import tie_off_s, 
input data, typ, keep, last, valid, output ready ); From 3e9154518d39ff0f6627fb69b9a95bc5ad7a3738 Mon Sep 17 00:00:00 2001 From: Luca Date: Thu, 15 Jan 2026 19:28:12 +0100 Subject: [PATCH 09/18] basic memory pool: support bounds checking on sub-buffers within an allocation --- software/libstf/memory_pool.cpp | 40 +++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/software/libstf/memory_pool.cpp b/software/libstf/memory_pool.cpp index 0f67208..6db0a31 100644 --- a/software/libstf/memory_pool.cpp +++ b/software/libstf/memory_pool.cpp @@ -197,25 +197,34 @@ unsigned HugePageMemoryPool::get_tcache_id_for_calling_thread() { return tcache_id; } +/** + * Checks if the given buffer at `buf` is within the provided `bounds`. + * Both the buffer and the bounds are encoded as a pair (base_addr, size). + */ +inline bool is_buffer_within_bounds(std::pair bounds, std::pair buf) { + auto buf_start = static_cast(std::get<0>(buf)); + auto buf_end = buf_start + std::get<1>(buf); + auto bounds_start = static_cast(std::get<0>(bounds)); + auto bounds_end = bounds_start + std::get<1>(bounds); + + return buf_start >= bounds_start && buf_end <= bounds_end; +} + bool HugePageMemoryPool::is_in_bounds(void *ptr, size_t size) { - auto address = static_cast(ptr); - auto initial = static_cast(initial_address_); - auto max_address = initial + total_capacity_; - return address >= initial && address + size - 1 <= max_address; + return is_buffer_within_bounds({initial_address_, total_capacity_}, {ptr, size}); } std::pair HugePageMemoryPool::get_page_boundaries(const void *ptr) { - auto byte_ptr = static_cast(ptr); - auto initial = static_cast(initial_address_); - auto max_address = initial + total_capacity_; - if (byte_ptr < initial || byte_ptr > max_address) { + if (!is_buffer_within_bounds({initial_address_, total_capacity_}, {const_cast(ptr), 0})) { std::ostringstream err; err << "The Provided address " << static_cast(ptr) << " is not within the bounds 
of the HugePageMemoryPool"; throw std::runtime_error(err.str()); } - auto n_th_page = (byte_ptr - initial) / PAGE_SIZE; + auto initial = reinterpret_cast(const_cast(initial_address_)); + auto buf = reinterpret_cast(const_cast(ptr)); + auto n_th_page = (buf - initial) / PAGE_SIZE; return std::make_pair(static_cast(initial + n_th_page * PAGE_SIZE), PAGE_SIZE); } @@ -445,13 +454,16 @@ void SimpleMemoryPool::free(void *ptr, size_t size, size_t alignment) { std::pair SimpleMemoryPool::get_page_boundaries(const void *ptr) { std::lock_guard lock(allocated_buffers_mutex); - if (allocated_buffers.find((void *) ptr) == allocated_buffers.end()) { - std::ostringstream err; - err << "The Provided address " << ptr << " is not within the bounds of the memory pool!"; - throw std::runtime_error(err.str()); + + for (const auto &buffer : allocated_buffers) { + if (is_buffer_within_bounds({buffer.first, buffer.second}, {const_cast(ptr), 0})) { + return {buffer.first, buffer.second}; + } } - return std::make_pair((void *) ptr, allocated_buffers[(void *) ptr]); + std::ostringstream err; + err << "The Provided address " << ptr << " is not within the bounds of the memory pool!"; + throw std::runtime_error(err.str()); } } // namespace libstf From 69a293e1df7e9443ccfaa9d6f5a54af4c0a10123 Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 16 Jan 2026 20:08:55 +0100 Subject: [PATCH 10/18] fix synthesis issue --- hardware/src/hdl/data_interfaces.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/hdl/data_interfaces.sv b/hardware/src/hdl/data_interfaces.sv index 23bca98..635b321 100644 --- a/hardware/src/hdl/data_interfaces.sv +++ b/hardware/src/hdl/data_interfaces.sv @@ -204,7 +204,7 @@ interface typed_ndata_i #( task tie_off_m(); // Tie off unused slave signals data = '0; - typ = '0; + typ = BYTE_T; keep = '0; last = 1'b0; valid = 1'b0; From e211d55bae1bd42a2bfb0ac86f6aa3a240b2971d Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Sun, 18 Jan 2026 18:01:57 +0100 
Subject: [PATCH 11/18] Proper software-side StreamConfig --- software/libstf/configuration.cpp | 14 ++++++++++++++ software/libstf/configuration.hpp | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/software/libstf/configuration.cpp b/software/libstf/configuration.cpp index 380a65b..198a322 100644 --- a/software/libstf/configuration.cpp +++ b/software/libstf/configuration.cpp @@ -126,4 +126,18 @@ void MemConfig::enqueue_buffer(stream_t stream_id, Buffer &buffer) { write_register(ConfigRegister(stream_id, vaddr << BUFFER_SIZE_BITS | capacity_as_num_transfers)); } +// ---------------------------------------------------------------------------- +// StreamConfig +// ---------------------------------------------------------------------------- + +StreamConfig::StreamConfig(std::shared_ptr cthread, uint32_t addr_offset) : + Config(cthread, addr_offset), + num_streams_(read_register(1).value()) {} + +void StreamConfig::enqueue_stream_config(stream_t stream_id, type_t type, uint8_t select) { + assert(stream_id < num_streams_); + + write_register(ConfigRegister(stream_id, (select << 3) | static_cast(type))); +} + } // namespace libstf diff --git a/software/libstf/configuration.hpp b/software/libstf/configuration.hpp index 97c7800..2a56370 100644 --- a/software/libstf/configuration.hpp +++ b/software/libstf/configuration.hpp @@ -102,7 +102,16 @@ class MemConfig : public Config { class StreamConfig : public Config { public: + StreamConfig(std::shared_ptr cthread, uint32_t addr_offset); + + void enqueue_stream_config(stream_t stream_id, type_t type, uint8_t select); + + const stream_t num_streams() const { return num_streams_; } + static constexpr uint32_t ID = 1; + +private: + stream_t num_streams_; }; } // namespace libstf From d93d0ed48f03c8034ecb57c3b3b6eadd7ec1252d Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Sun, 18 Jan 2026 18:02:31 +0100 Subject: [PATCH 12/18] Value variant and direct make_buffer --- software/libstf/buffer.cpp | 9 +++++++++ 
software/libstf/buffer.hpp | 1 + software/libstf/common.cpp | 5 +++++ software/libstf/common.hpp | 6 ++++++ 4 files changed, 21 insertions(+) diff --git a/software/libstf/buffer.cpp b/software/libstf/buffer.cpp index 1a4f455..eb8a934 100644 --- a/software/libstf/buffer.cpp +++ b/software/libstf/buffer.cpp @@ -17,4 +17,13 @@ std::shared_ptr make_buffer(std::shared_ptr memory_pool, voi BufferDeleter(memory_pool)); } +std::shared_ptr make_buffer(std::shared_ptr memory_pool, size_t size, Status &status) { + void *ptr; + status = memory_pool->allocate(size, &ptr); + + return std::shared_ptr( + new Buffer{.ptr = ptr, .size = size, .capacity = size}, + BufferDeleter(memory_pool)); +} + } // namespace libstf diff --git a/software/libstf/buffer.hpp b/software/libstf/buffer.hpp index c9113a5..5a836e6 100644 --- a/software/libstf/buffer.hpp +++ b/software/libstf/buffer.hpp @@ -23,5 +23,6 @@ struct BufferDeleter { }; std::shared_ptr make_buffer(std::shared_ptr memory_pool, void *ptr, size_t size, size_t capacity); +std::shared_ptr make_buffer(std::shared_ptr memory_pool, size_t size, Status &status); } // namespace libstf diff --git a/software/libstf/common.cpp b/software/libstf/common.cpp index e8e23e4..99d98e1 100644 --- a/software/libstf/common.cpp +++ b/software/libstf/common.cpp @@ -24,3 +24,8 @@ std::ostream &operator<<(std::ostream &out, const type_t &data_type) { } } // namespace libstf + +std::ostream &operator<<(std::ostream &out, const libstf::Value &v) { + std::visit([&out](auto &&val) { out << val; }, v); + return out; +} diff --git a/software/libstf/common.hpp b/software/libstf/common.hpp index d0c390f..25e6e7b 100644 --- a/software/libstf/common.hpp +++ b/software/libstf/common.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include namespace libstf { @@ -51,4 +52,9 @@ constexpr size_t size_of(type_t type) { throw std::invalid_argument("Invalid type"); } +using Value = std::variant; + } // namespace celeris + +// If this is not defined in the global namespace, we 
cannot find it in Celeris +std::ostream &operator<<(std::ostream &out, const libstf::Value &v); From f0fba3b13fb5ddd7134fff298ac84ab360559eb8 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 00:09:57 +0100 Subject: [PATCH 13/18] Dispatch types --- software/libstf/common.hpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/software/libstf/common.hpp b/software/libstf/common.hpp index 25e6e7b..ed1895f 100644 --- a/software/libstf/common.hpp +++ b/software/libstf/common.hpp @@ -52,7 +52,28 @@ constexpr size_t size_of(type_t type) { throw std::invalid_argument("Invalid type"); } -using Value = std::variant; +/** + * Type dispatcher for type_t. This assumes you pass it a struct with an operator as the Func. + */ +template +auto dispatch_type(libstf::type_t type, Func &&func, Args &&... args) { + switch (type) { + case libstf::type_t::BYTE_T: + return func.template operator()(std::forward(args)...); + case libstf::type_t::INT32_T: + return func.template operator()(std::forward(args)...); + case libstf::type_t::INT64_T: + return func.template operator()(std::forward(args)...); + case libstf::type_t::FLOAT_T: + return func.template operator()(std::forward(args)...); + case libstf::type_t::DOUBLE_T: + return func.template operator()(std::forward(args)...); + case type_t::NUM_TYPES: break; + } + throw std::invalid_argument("Invalid type"); +} + +using Value = std::variant; } // namespace celeris From 0ad3cfe52768bf08e1d54511006bc57603d93de8 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 03:12:52 +0100 Subject: [PATCH 14/18] AXI multiplexer --- .gitignore | 2 +- hardware/src/hdl/axi/axi_multiplexer.sv | 48 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 hardware/src/hdl/axi/axi_multiplexer.sv diff --git a/.gitignore b/.gitignore index 5d48747..7a6b619 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ build -hardware/build_hw/** +hardware/build_hw # 
Generated unit test files hardware/unit-tests/diff/** diff --git a/hardware/src/hdl/axi/axi_multiplexer.sv b/hardware/src/hdl/axi/axi_multiplexer.sv new file mode 100644 index 0000000..53ab6a5 --- /dev/null +++ b/hardware/src/hdl/axi/axi_multiplexer.sv @@ -0,0 +1,48 @@ +`timescale 1ns / 1ps + +import libstf::*; + +module AXIMultiplexer #( + parameter NUM_STREAMS +) ( + input logic clk, + input logic rst_n, + + ready_valid_i.s select, // #(logic[$clog2(NUM_STREAMS) - 1:0]) + + AXI4S.s in[NUM_STREAMS], + AXI4S.m out +); + +ndata_i #(data8_t, AXI_DATA_BITS / 8) in_data[NUM_STREAMS](); +ndata_i #(data8_t, AXI_DATA_BITS / 8) out_data(); + +for (genvar I = 0; I < NUM_STREAMS; I++) begin + assign in_data[I].data = in[I].tdata; + assign in_data[I].keep = in[I].tkeep; + assign in_data[I].last = in[I].tlast; + assign in_data[I].valid = in[I].tvalid; + assign in[I].tready = in_data[I].ready; +end + +DataMultiplexer #( + .data_t(data8_t), + .NUM_ELEMENTS(AXI_DATA_BITS / 8), + .NUM_STREAMS(NUM_STREAMS) +) inst_mux ( + .clk(clk), + .rst_n(rst_n), + + .select(select), + + .in(in_data), + .out(out_data) +); + +assign out.tdata = out_data.data; +assign out.tkeep = out_data.keep; +assign out.tlast = out_data.last; +assign out.tvalid = out_data.valid; +assign out_data.ready = out.tready; + +endmodule From 2e3a2a063fb014661ae451203efa2ead5463c3b6 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 18:47:43 +0100 Subject: [PATCH 15/18] Fixed left over StreamConfig parameter --- hardware/unit-tests/vfpga_tops/dict_test.sv | 3 +-- hardware/unit-tests/vfpga_tops/typed_dict_test.sv | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/hardware/unit-tests/vfpga_tops/dict_test.sv b/hardware/unit-tests/vfpga_tops/dict_test.sv index 5e6ca92..a4ad4c3 100644 --- a/hardware/unit-tests/vfpga_tops/dict_test.sv +++ b/hardware/unit-tests/vfpga_tops/dict_test.sv @@ -36,7 +36,7 @@ ndata_i #(data64_t, NUM_ELEMENTS) dict_out(); AXI4S axi_out[N_STRM_AXI](.aclk(clk), 
.aresetn(rst_n)); -// -- Configuration ------------------------------------------------------------------------------- +// -- Configuration -------------------------------------------------------------------------------- write_config_i write_configs[1](.*); read_config_i read_configs [1](.*); GlobalConfig #( @@ -55,7 +55,6 @@ GlobalConfig #( stream_config_i stream_config[1](.*); StreamConfig #( - .NUM_SELECT(2), .NUM_STREAMS(1) ) inst_stream_config ( .clk(clk), diff --git a/hardware/unit-tests/vfpga_tops/typed_dict_test.sv b/hardware/unit-tests/vfpga_tops/typed_dict_test.sv index 0e5eba1..b263619 100644 --- a/hardware/unit-tests/vfpga_tops/typed_dict_test.sv +++ b/hardware/unit-tests/vfpga_tops/typed_dict_test.sv @@ -56,7 +56,6 @@ GlobalConfig #( stream_config_i stream_config[1](.*); StreamConfig #( - .NUM_SELECT(2), .NUM_STREAMS(1) ) inst_stream_config ( .clk(clk), From 3ee2cf87380bedf2c27ef53f3f2ed2466cc36333 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 18:48:40 +0100 Subject: [PATCH 16/18] Made NDataWidthConverter more flexible to handle any power of two input width that is less or equal to the output width --- .../src/hdl/stream/data_width_converter.sv | 78 ++++++++++--------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/hardware/src/hdl/stream/data_width_converter.sv b/hardware/src/hdl/stream/data_width_converter.sv index 0b3170d..f8c6b0a 100644 --- a/hardware/src/hdl/stream/data_width_converter.sv +++ b/hardware/src/hdl/stream/data_width_converter.sv @@ -5,7 +5,8 @@ /** * Converts an ndata_i stream to a different width. * - * Currently only same input and output width and 8 to 16 elements is supported. + * Note: Supports any power-of-two IN_WIDTH that is <= OUT_WIDTH. OUT_WIDTH must be a multiple of + * IN_WIDTH. 
*/ module NDataWidthConverter #( parameter type data_t @@ -17,67 +18,68 @@ module NDataWidthConverter #( ndata_i.m out // #(data_t, OUT_WIDTH) ); -localparam IN_WIDTH = in.NUM_ELEMENTS; -localparam OUT_WIDTH = out.NUM_ELEMENTS; +localparam IN_WIDTH = in.NUM_ELEMENTS; +localparam OUT_WIDTH = out.NUM_ELEMENTS; +localparam NUM_SLOTS = OUT_WIDTH / IN_WIDTH; +localparam SLOT_COUNTER_WIDTH = $clog2(NUM_SLOTS); -`ASSERT_ELAB(IN_WIDTH == OUT_WIDTH || IN_WIDTH == 8 && OUT_WIDTH == 16) +`ASSERT_ELAB(IN_WIDTH <= OUT_WIDTH) +`ASSERT_ELAB((IN_WIDTH & (IN_WIDTH - 1)) == 0) // IN_WIDTH is power of 2 +`ASSERT_ELAB((OUT_WIDTH & (OUT_WIDTH - 1)) == 0) // OUT_WIDTH is power of 2 +`ASSERT_ELAB(OUT_WIDTH % IN_WIDTH == 0) // Exact multiple generate if (IN_WIDTH == OUT_WIDTH) begin `DATA_ASSIGN(in, out) -end else if (IN_WIDTH == 8 && OUT_WIDTH == 16) begin - logic is_upper, n_is_upper; +end else begin + logic[SLOT_COUNTER_WIDTH - 1:0] slot_idx, n_slot_idx; data_t[OUT_WIDTH - 1:0] data, n_data; - logic[OUT_WIDTH - 1:0] keep, n_keep; + logic [OUT_WIDTH - 1:0] keep, n_keep; logic last, n_last; logic valid, n_valid; assign in.ready = out.ready; always_ff @(posedge clk) begin - if (rst_n == 1'b0) begin - is_upper <= 1'b0; - - valid <= 1'b0; + if (!rst_n) begin + slot_idx <= '0; + valid <= 1'b0; end else begin - is_upper <= n_is_upper; - - data <= n_data; - keep <= n_keep; - last <= n_last; - valid <= n_valid; + slot_idx <= n_slot_idx; + data <= n_data; + keep <= n_keep; + last <= n_last; + valid <= n_valid; end end always_comb begin - n_is_upper = is_upper; - - n_data = data; - n_keep = keep; - n_last = last; - n_valid = 1'b0; + n_slot_idx = slot_idx; + n_data = data; + n_keep = keep; + n_last = last; + n_valid = 1'b0; if (out.ready) begin if (in.valid) begin - if (!in.last) begin - n_is_upper = ~is_upper; + if (in.last) begin + n_slot_idx = '0; end else begin - n_is_upper = 1'b0; + n_slot_idx = slot_idx + 1; // Wraps around end end - if (!is_upper) begin - n_data[7:0] = in.data; - 
n_keep[15:8] = '0; - n_keep[7:0] = in.keep; - - if (in.last) begin - n_valid = in.valid; - end - end else begin - n_data[15:8] = in.data; - n_keep[15:8] = in.keep; - n_valid = in.valid; + if (slot_idx == 0) begin + n_keep = '0; + end + + for (int i = 0; i < IN_WIDTH; i++) begin + n_data[slot_idx * IN_WIDTH + i] = in.data[i]; + n_keep[slot_idx * IN_WIDTH + i] = in.keep[i]; + end + + if (in.valid && (in.last || slot_idx == SLOT_COUNTER_WIDTH'(NUM_SLOTS - 1))) begin + n_valid = 1'b1; end n_last = in.last; From 77ab8b04338e88e8fd96b89a5f10f7007e93545c Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 20:23:11 +0100 Subject: [PATCH 17/18] Added SortedSeqToBitmask module and test --- hardware/src/hdl/stream/data_adapters.sv | 1 - .../src/hdl/util/sorted_seq_to_bitmask.sv | 119 ++++++++++++++++++ .../unit-tests/sorted_seq_to_bitmask_test.py | 105 ++++++++++++++++ .../vfpga_tops/sorted_seq_to_bitmask_test.sv | 72 +++++++++++ 4 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 hardware/src/hdl/util/sorted_seq_to_bitmask.sv create mode 100644 hardware/unit-tests/sorted_seq_to_bitmask_test.py create mode 100644 hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv diff --git a/hardware/src/hdl/stream/data_adapters.sv b/hardware/src/hdl/stream/data_adapters.sv index 76ca9c9..e13750f 100644 --- a/hardware/src/hdl/stream/data_adapters.sv +++ b/hardware/src/hdl/stream/data_adapters.sv @@ -26,7 +26,6 @@ localparam AXI_ELEMENT_SIZE = AXI_ELEMENT_WIDTH / 8; `ASSERT_ELAB(AXI_WIDTH == AXI_ELEMENT_WIDTH * NUM_AXI_ELEMENTS) `ASSERT_ELAB($bits(data_t) <= AXI_ELEMENT_WIDTH) -`ASSERT_ELAB(NUM_ELEMENTS == NUM_AXI_ELEMENTS || 2 * NUM_ELEMENTS == NUM_AXI_ELEMENTS) ndata_i #(data_t, NUM_AXI_ELEMENTS) internal(); diff --git a/hardware/src/hdl/util/sorted_seq_to_bitmask.sv b/hardware/src/hdl/util/sorted_seq_to_bitmask.sv new file mode 100644 index 0000000..1e4b449 --- /dev/null +++ b/hardware/src/hdl/util/sorted_seq_to_bitmask.sv @@ -0,0 +1,119 @@ 
+`timescale 1ns / 1ps + +module SortedSeqToBitmask #( + parameter type data_t, + parameter NUM_ELEMENTS +) ( + input logic clk, + input logic rst_n, + + ndata_i.s in, // #(data_t, NUM_ELEMENTS) + data_i.m out // #(logic[NUM_ELEMENTS - 1:0]) +); + +`RESET_RESYNC // Reset pipelining + +typedef logic[NUM_ELEMENTS - 1:0] mask_t; + +// The first ID we are currently creating the mask for and IDs relative to this +data_t current_id, n_current_id, id_end_of_mask; +data_t[NUM_ELEMENTS - 1:0] relative_ids; // TODO: Reduce width + +mask_t current_mask, mask, n_mask; +mask_t current_processed, processed, n_processed; + +logic data_beat_done; +logic exact_end_of_mask; + +mask_t n_out_data; +logic n_out_last; +logic n_out_keep; +logic n_out_valid; + +always_comb begin + for (int i = 0; i < NUM_ELEMENTS; i++) begin + relative_ids[i] = in.data[i] - current_id; + end +end + +assign id_end_of_mask = current_id + (NUM_ELEMENTS - 1); + +always_comb begin + current_mask = '0; + current_processed = '0; + exact_end_of_mask = 1'b0; + + // Set bits at the specified indices + for (int i = 0; i < NUM_ELEMENTS; i++) begin + if (in.keep[i] && relative_ids[i] < NUM_ELEMENTS) begin + current_mask[relative_ids[i]] |= 1'b1; + current_processed[i] = 1'b1; + end + if (in.data[i] == id_end_of_mask) begin + exact_end_of_mask |= 1'b1; + end + end +end + +assign data_beat_done = (processed | current_processed) == in.keep; + +always_ff @(posedge clk) begin + if (reset_synced == 1'b0) begin + current_id <= '0; + mask <= '0; + processed <= '0; + end else begin + current_id <= n_current_id; + processed <= n_processed; + mask <= n_mask; + + out.data <= n_out_data; + out.last <= n_out_last; + out.keep <= n_out_keep; + out.valid <= n_out_valid; + end +end + +always_comb begin + in.ready = 1'b0; + + n_current_id = current_id; + n_processed = processed; + n_mask = mask; + + n_out_data = out.data; + n_out_last = out.last; + n_out_keep = out.keep; + n_out_valid = 1'b0; + + if (out.ready || !out.valid) begin + if 
(data_beat_done) begin + in.ready = 1'b1; + end + + if (in.valid) begin + if (data_beat_done) begin + n_processed = '0; + end else begin + n_processed = processed | current_processed; + end + + if (!data_beat_done || exact_end_of_mask || in.last) begin + n_current_id = current_id + NUM_ELEMENTS; + + n_out_data = mask | current_mask; + n_out_last = data_beat_done && in.last; + n_out_keep = 1'b1; + n_out_valid = 1'b1; + + n_mask = '0; + end else begin + n_mask = mask | current_mask; + end + end + end else begin + n_out_valid = out.valid; + end +end + +endmodule diff --git a/hardware/unit-tests/sorted_seq_to_bitmask_test.py b/hardware/unit-tests/sorted_seq_to_bitmask_test.py new file mode 100644 index 0000000..53aa7c6 --- /dev/null +++ b/hardware/unit-tests/sorted_seq_to_bitmask_test.py @@ -0,0 +1,105 @@ +from typing import List +import math + +from coyote_test import fpga_test_case + +from unit_test.fpga_stream import Stream, StreamType + +class SortedSeqToBitmaskTest(fpga_test_case.FPGATestCase): + alternative_vfpga_top_file = "vfpga_tops/sorted_seq_to_bitmask_test.sv" + debug_mode = True + verbose_logging = True + + def setUp(self): + super().setUp() + self.ids : List[int] = None + + def compute_expected_bitmask(self) -> List[bool]: + # Get the number of bits the bitmask should have. + # -> Next multiple of 8 that is >= max_id + n_bits = math.ceil((self.ids[-1] + 1) / 8) * 8 + bitmask = [True if id in self.ids else False for id in range(0, n_bits)] + return bitmask + + def simulate_fpga(self): + assert self.ids is not None, "ids cannot be None" + + self.set_stream_input(0, Stream(StreamType.UNSIGNED_INT_32, self.ids)) + + # Set the expected output + self.set_expected_output(0, Stream(StreamType.ARROW_BOOL, self.compute_expected_bitmask())) + + return super().simulate_fpga() + + def test_with_continuous_ids(self): + """ + Test behavior with continuous tuple ids, without gaps. + Every 8 ids, one mask with all 1 should be created. 
+ """ + # Arrange + self.ids = list(range(0, 400)) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_with_gaps_that_fit_into_one_mask(self): + """ + Test behavior that tests indices that still fit into + one mask (8-bit) but are irregular + """ + # Arrange + self.ids = [0, 2, 4, 7, 10, 11, 13, 15] + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_large_gaps_that_need_to_be_filled(self): + """ + Tests tuple ids that have large gaps in between them. + Those gaps need to be filled with masks that are all zero. + """ + self.ids = list(range(0, 192, 24)) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_mask_with_non_full_data_beat(self): + """ + Tests the behavior with masks that do not have 4 masks per data beat + """ + self.ids = list(range(0, 37)) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_mixed_mask(self): + """ + Test produces a mask that exhibits most of the properties + discussed above. The first few data beats produce full + masks. Then we have some larger gaps. Finally, the + input ends with a non-full data beat. 
+ """ + continuous = list(range(0, 33)) + gaps = list(range(33, 90, 17)) + end = [92, 97] + self.ids = continuous + gaps + end + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + diff --git a/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv b/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv new file mode 100644 index 0000000..7b061e4 --- /dev/null +++ b/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv @@ -0,0 +1,72 @@ +import libstf::*; + +parameter NUM_ELEMENTS = 8; + +// -- Tie-off unused interfaces and signals -------------------------------------------------------- +always_comb axi_ctrl.tie_off_s(); +always_comb notify.tie_off_m(); +always_comb sq_rd.tie_off_m(); +always_comb sq_wr.tie_off_m(); +always_comb cq_rd.tie_off_s(); +always_comb cq_wr.tie_off_s(); + +for (genvar I = 1; I < N_STRM_AXI; I++) begin + always_comb axis_host_recv[I].tie_off_s(); + always_comb axis_host_send[I].tie_off_m(); +end + +// -- Fix clock and reset names -------------------------------------------------------------------- +logic clk; +logic rst_n; + +assign clk = aclk; +assign rst_n = aresetn; + +// -- Signals -------------------------------------------------------------------------------------- +typedef logic[NUM_ELEMENTS - 1:0] mask_t; + +AXI4S axi_host_recv_0(.aclk(clk), .aresetn(rst_n)); + +ndata_i #(data32_t, NUM_ELEMENTS) sorted_seq(.*); +data_i #(mask_t) bitmask(.*); +ndata_i #(mask_t, 1) bitmask_ndata(.*); + +AXI4S bitmask_collected(.aclk(clk), .aresetn(rst_n)); + +`AXIS_ASSIGN(axis_host_recv[0], axi_host_recv_0) // AXI4SR to AXI4S +AXIToNData #( + .AXI_WIDTH(AXI_DATA_BITS), + .NUM_AXI_ELEMENTS(16), + .data_t(data32_t), + .NUM_ELEMENTS(NUM_ELEMENTS) +) inst_axi_to_ndata ( + .clk(clk), + .rst_n(rst_n), + + .in(axi_host_recv_0), + .out(sorted_seq) +); + +SortedSeqToBitmask #( + .data_t(data32_t), + .NUM_ELEMENTS(NUM_ELEMENTS) +) inst_seq_to_bitmask ( + .clk(clk), + .rst_n(rst_n), + .in(sorted_seq), + 
.out(bitmask) +); + +`DATA_ASSIGN(bitmask, bitmask_ndata) +NDataToAXI #( + .data_t(mask_t), + .NUM_ELEMENTS(1), + .AXI_WIDTH(AXI_DATA_BITS), + .NUM_AXI_ELEMENTS(64) +) inst_ndata_to_axi ( + .clk(clk), + .rst_n(rst_n), + .in(bitmask_ndata), + .out(bitmask_collected) +); +`AXIS_ASSIGN(bitmask_collected, axis_host_send[0]) From 9f91c5f56fa906d25af168c7496747ece79017e7 Mon Sep 17 00:00:00 2001 From: Jonas Dann Date: Tue, 20 Jan 2026 21:00:07 +0100 Subject: [PATCH 18/18] Reduced relative id width --- hardware/src/hdl/util/sorted_seq_to_bitmask.sv | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hardware/src/hdl/util/sorted_seq_to_bitmask.sv b/hardware/src/hdl/util/sorted_seq_to_bitmask.sv index 1e4b449..2b477ba 100644 --- a/hardware/src/hdl/util/sorted_seq_to_bitmask.sv +++ b/hardware/src/hdl/util/sorted_seq_to_bitmask.sv @@ -13,11 +13,15 @@ module SortedSeqToBitmask #( `RESET_RESYNC // Reset pipelining +localparam DATA_WIDTH = $bits(data_t); +localparam RIDX_WIDTH = $clog2(NUM_ELEMENTS) + 1; + typedef logic[NUM_ELEMENTS - 1:0] mask_t; +typedef logic[RIDX_WIDTH - 1:0] rid_t; // The first ID we are currently creating the mask for and IDs relative to this -data_t current_id, n_current_id, id_end_of_mask; -data_t[NUM_ELEMENTS - 1:0] relative_ids; // TODO: Reduce width +data_t current_id, n_current_id, id_end_of_mask; +rid_t[NUM_ELEMENTS - 1:0] relative_ids; mask_t current_mask, mask, n_mask; mask_t current_processed, processed, n_processed; @@ -32,7 +36,8 @@ logic n_out_valid; always_comb begin for (int i = 0; i < NUM_ELEMENTS; i++) begin - relative_ids[i] = in.data[i] - current_id; + data_t diff = in.data[i] - current_id; + relative_ids[i] = {|diff[DATA_WIDTH - 1: RIDX_WIDTH - 1], diff[RIDX_WIDTH - 2:0]}; end end