diff --git a/.gitignore b/.gitignore
index 1722478..a725e42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,7 @@
 **/.idea/*
 .cache/
 bench/
-experiment/
\ No newline at end of file
+experiment/
+**/results
+**.pyc
+**/__pycache__
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b98cda..b7fffd8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,3 +27,9 @@
 if (WITH_TESTS)
     add_subdirectory(tests)
 endif()
+
+if (WITH_BENCHMARKS)
+
+add_subdirectory(benchmarks)
+
+endif()
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..e1a8aed
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,118 @@
+cmake_minimum_required(VERSION 3.10)
+project(dsr_benchmarks
+        VERSION 2024.12.01
+        DESCRIPTION "DSR Benchmarking Suite"
+        LANGUAGES CXX)
+
+# Fetch Catch2 if not already available
+include(FetchContent)
+
+FetchContent_Declare(
+    Catch2
+    GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+    GIT_TAG v3.8.0
+)
+
+FetchContent_MakeAvailable(Catch2)
+
+# Find required packages
+find_package(Boost REQUIRED)
+find_package(Qt6 COMPONENTS Core REQUIRED)
+find_package(Eigen3 3.3 REQUIRED NO_MODULE)
+
+# Collect source files
+set(BENCHMARK_SOURCES
+    benchmark_main.cpp
+
+    # Latency benchmarks
+    latency/delta_propagation_bench.cpp
+    latency/signal_latency_bench.cpp
+    latency/crdt_join_bench.cpp
+
+    # Throughput benchmarks
+    throughput/single_agent_ops_bench.cpp
+    throughput/concurrent_writers_bench.cpp
+    throughput/single_agent_ops_with_latency_bench.cpp
+
+    # Scalability benchmarks
+    scalability/multi_agent_sync_bench.cpp
+    scalability/graph_size_impact_bench.cpp
+    scalability/thread_scaling_bench.cpp
+    scalability/graph_size_scaling_bench.cpp
+    scalability/agent_scaling_bench.cpp
+
+    # Consistency benchmarks
+    consistency/convergence_time_bench.cpp
+    consistency/conflict_rate_bench.cpp
+)
+
+# Header files for IDE integration
+set(BENCHMARK_HEADERS
+    core/benchmark_config.h
+    core/timing_utils.h
+    core/metrics_collector.h
+    core/report_generator.h
+    fixtures/multi_agent_fixture.h
+    fixtures/graph_generator.h
+)
+
+# Create benchmark executable
+add_executable(dsr_benchmarks
+    ${BENCHMARK_SOURCES}
+    ${BENCHMARK_HEADERS}
+)
+
+# Set C++ standard
+# NOTE: the target property is CXX_STANDARD; the CMAKE_-prefixed name is the
+# cache variable and is silently ignored inside set_target_properties().
+set_target_properties(dsr_benchmarks PROPERTIES
+    CXX_STANDARD 23
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS ON
+)
+
+target_compile_options(dsr_benchmarks PUBLIC -g -std=c++23)
+
+# Include directories
+target_include_directories(dsr_benchmarks PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/core
+    ${CMAKE_CURRENT_SOURCE_DIR}/fixtures
+)
+
+# Link libraries
+target_link_libraries(dsr_benchmarks PRIVATE
+    Catch2::Catch2
+    dsr_api
+    dsr_core
+    Qt6::Core
+    Eigen3::Eigen
+    fastdds
+    fastcdr
+)
+
+# Create results directory
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/results)
+
+# Copy results directory structure
+add_custom_command(TARGET dsr_benchmarks POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:dsr_benchmarks>/results
+    COMMENT "Creating results directory"
+)
+
+# Register tests with CTest (optional)
+# Disabled auto-discovery as it requires running the binary at build time
+# which may fail if libraries are not in LD_LIBRARY_PATH
+# include(Catch)
+# catch_discover_tests(dsr_benchmarks)
+
+# Installation (optional)
+install(TARGETS dsr_benchmarks
+    RUNTIME DESTINATION bin
+)
+
+# Print configuration summary
+message(STATUS "")
+message(STATUS "DSR Benchmarks Configuration:")
+message(STATUS "  Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "  C++ Standard: C++23")
+message(STATUS "  Catch2 version: 3.8.0")
+message(STATUS "")
diff --git a/benchmarks/TODO.md b/benchmarks/TODO.md
new file mode 100644
index 0000000..417bb44
--- /dev/null
+++ b/benchmarks/TODO.md
@@ -0,0 +1,69 @@
+# Benchmarks - Pending Items
+
+## Working Tests
+
+The following benchmarks run by default (no filter needed):
+
+### Single-agent (C++)
+- Node/edge insert, read, update, delete throughput `[THROUGHPUT][single]`
+- Node/edge insert, read, update, delete latency+throughput `[THROUGHPUT][LATENCY][single]`
+- Concurrent writers throughput `[THROUGHPUT][concurrent]`
+- Signal emission latency `[LATENCY][signal]`
+- Signal emission under load `[LATENCY][signal][stress]`
+- CRDT mvreg operations `[CRDT][mvreg]`
+- CRDT dot_context operations `[CRDT][dot_context]`
+
+### Scalability (C++)
+- Thread scaling per operation `[SCALABILITY][threads]`
+- Graph size impact per operation `[SCALABILITY][graphsize]`
+- Graph size impact on performance `[SCALABILITY][memory]`
+
+### Python
+- `bench_graph_operations.py` — node/edge CRUD
+- `bench_throughput.py` — 5-second throughput+latency windows
+- `bench_signals.py` — signal callback latency and throughput
+- `bench_binding_overhead.py` — pydsr binding overhead
+
+Run all: `./dsr_benchmarks '~[.multi]'`
+Run specific: `./dsr_benchmarks '[CRDT]'`, `./dsr_benchmarks '[THROUGHPUT]'`
+
+## Known Issues
+
+### Multi-agent tests disabled (tag: `.multi`)
+- DDS synchronization not working in test environment
+- Signals from agent A not propagating to agent B handlers
+- Agents discover each other (DDS participant matching works) but data doesn't sync
+- Run with `./dsr_benchmarks "[.multi]"` to explicitly test
+
+### API note: insert_node auto-generates IDs
+- `DSRGraph::insert_node()` ignores the ID set on the node and generates a new one
+- Use `insert_node_with_id` to use a provided ID; check return value to confirm
+- The returned `std::optional` contains the actual assigned ID
+
+
+## Python Benchmarks
+- [ ] Add RT_API transform benchmarks
+- [ ] Add InnerEigenAPI spatial transform benchmarks
+- [ ] Benchmark Python ↔ C++ data conversion overhead (Eigen matrices, large arrays)
+- [ ] Add multi-agent Python benchmarks (multiple DSRGraph instances)
+
+## C++ Benchmarks
+- [ ] Add DDS-specific latency benchmarks (network layer)
+- [ ] Benchmark different QoS settings impact
+- [ ] Add RT_API benchmarks
+- [ ] Add Eigen api benchmarks
+
+## Profiling
+- [ ] Add Tracy profiler instrumentation (zones for delta propagation, CRDT joins, DDS pub/sub)
+- [ ] Create Tracy build option (`WITH_TRACY`)
+- [ ] Document Tracy vs perf usage
+
+## Infrastructure (DON'T DO UNLESS EXPLICIT REQUEST)
+- [ ] CI integration (run benchmarks on PR, compare with baseline)
+- [ ] Historical results tracking
+- [ ] Regression detection with configurable thresholds
+- [ ] Grafana/dashboard export format
+
+## Documentation
+- [ ] Benchmark interpretation guide
+- [ ] Performance tuning recommendations based on results
diff --git a/benchmarks/benchmark_main.cpp b/benchmarks/benchmark_main.cpp
new file mode 100644
index 0000000..6084ecd
--- /dev/null
+++ b/benchmarks/benchmark_main.cpp
@@ -0,0 +1,95 @@
+// DSR Benchmarking Suite
+// Main entry point using Catch2
+
+#define CATCH_CONFIG_RUNNER
+#include <catch2/catch_session.hpp>
+#include <QCoreApplication>
+#include <iostream>
+#include <string>
+
+// Custom Qt message handler to filter debug output during benchmarks
+static bool g_verbose = false;
+
+void benchmarkMessageHandler(QtMsgType type, const QMessageLogContext& context, const QString& msg) {
+    // In non-verbose mode, only show warnings and above
+    if (!g_verbose) {
+        switch (type) {
+            case QtDebugMsg:
+            case QtInfoMsg:
+                return; // Suppress debug and info messages
+            default:
+                break;
+        }
+    }
+
+    // Format and output remaining messages
+    QByteArray localMsg = msg.toLocal8Bit();
+    switch (type) {
+        case QtDebugMsg:
+            std::cout << "[DEBUG] " << localMsg.constData() << std::endl;
+            break;
+        case QtInfoMsg:
+            std::cout << "[INFO] " << localMsg.constData() << std::endl;
+            break;
+        case QtWarningMsg:
+            std::cout << "[WARNING] " << localMsg.constData() << std::endl;
+            break;
+        case QtCriticalMsg:
+            std::cout << "[CRITICAL] " << localMsg.constData() << std::endl;
+            break;
+        case QtFatalMsg:
+            std::cout << "[FATAL] " << localMsg.constData() << std::endl;
+            abort();
+    }
+}
+
+int main(int argc, char* argv[]) {
+    // Install custom message handler before QCoreApplication
+    qInstallMessageHandler(benchmarkMessageHandler);
+
+    // Check for verbose flag
+    for (int i = 1; i < argc; ++i) {
+        if (std::string(argv[i]) == "--verbose" || std::string(argv[i]) == "-v") {
+            g_verbose = true;
+            break;
+        }
+    }
+
+    // Initialize Qt (required for signals/slots)
+    QCoreApplication app(argc, argv);
+    // Initialize Catch2
+    Catch::Session session;
+
+    // Set default reporter to console with colors
+    session.configData().showDurations = Catch::ShowDurations::Always;
+
+    // Apply command line arguments
+    int returnCode = session.applyCommandLine(argc, argv);
+    if (returnCode != 0) {
+        return returnCode;
+    }
+
+    std::cout << "=================================\n";
+    std::cout << "  DSR Benchmarking Suite\n";
+    std::cout << "=================================\n\n";
+    std::cout << "Available benchmark categories:\n";
+    std::cout << "  [LATENCY]     - Signal emission, CRDT operations\n";
+    std::cout << "  [THROUGHPUT]  - Single agent insert/read/update/delete, concurrent writers\n";
+    std::cout << "  [CRDT]        - mvreg and dot_context micro-benchmarks\n";
+    std::cout << "  [SCALABILITY] - Thread scaling, graph size impact\n";
+    std::cout << "  [CONSISTENCY] - Convergence time, conflict rates\n";
+    std::cout << "\n";
+    std::cout << "Usage examples:\n";
+    std::cout << "  ./dsr_benchmarks                      # Run all non-hidden benchmarks\n";
+    std::cout << "  ./dsr_benchmarks \"[LATENCY]\"          # Run latency benchmarks\n";
+    std::cout << "  ./dsr_benchmarks \"[THROUGHPUT]\"       # Run throughput benchmarks\n";
+    std::cout << "  ./dsr_benchmarks \"[CRDT]\"             # Run CRDT micro-benchmarks\n";
+    std::cout << "  ./dsr_benchmarks \"[.multi]\"           # Run multi-agent tests (may timeout)\n";
+    std::cout << "  ./dsr_benchmarks -r json::out=x.json  # Export to JSON\n";
+    std::cout << "  ./dsr_benchmarks --verbose            # Show Qt debug messages\n";
+    std::cout << "\n";
+    std::cout << "Note: Only multi-agent [.multi] tests are hidden by default.\n";
+    std::cout << "\n";
+
+    return session.run();
+}
diff --git
a/benchmarks/consistency/conflict_rate_bench.cpp b/benchmarks/consistency/conflict_rate_bench.cpp new file mode 100644 index 0000000..ab212ad --- /dev/null +++ b/benchmarks/consistency/conflict_rate_bench.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Conflict rate benchmarks", "[CONSISTENCY][conflict][.multi]") { + GraphGenerator generator; + MetricsCollector collector("conflict_rate"); + + SECTION("Concurrent attribute updates - same node") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + // Create shared node and capture actual ID + auto* agent_0 = fixture.get_agent(0); + auto shared_node = GraphGenerator::create_test_node( + 0, agent_0->get_agent_id(), "conflict_test"); + auto insert_result = agent_0->insert_node(shared_node); + REQUIRE(insert_result.has_value()); + uint64_t shared_node_id = insert_result.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + constexpr int NUM_ROUNDS = 50; + constexpr int UPDATES_PER_AGENT = 10; + constexpr size_t NUM_AGENTS = 4; + + std::atomic total_updates{0}; + uint64_t conflicts_detected = 0; + + std::barrier sync_point(NUM_AGENTS); + + for (int round = 0; round < NUM_ROUNDS; ++round) { + std::vector threads; + threads.reserve(NUM_AGENTS); + + // Record initial values before concurrent updates + std::vector expected_values(NUM_AGENTS); + for (size_t i = 0; i < NUM_AGENTS; ++i) { + expected_values[i] = static_cast(round * 1000 + i * 100); + } + + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx, node_id = shared_node_id]() { + auto* 
agent = fixture.get_agent(agent_idx); + sync_point.arrive_and_wait(); + + for (int i = 0; i < UPDATES_PER_AGENT; ++i) { + auto node = agent->get_node(node_id); + if (node) { + int32_t value = static_cast( + round * 1000 + agent_idx * 100 + i); + agent->add_or_modify_attrib_local(*node, value); + agent->update_node(*node); + total_updates.fetch_add(1, std::memory_order_relaxed); + } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for convergence + fixture.wait_for_sync(std::chrono::milliseconds(500)); + + // Check if all agents converged to the same value + std::set final_values; + for (size_t i = 0; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + auto node = agent->get_node(shared_node_id); + if (node) { + auto attr = agent->get_attrib_by_name(*node); + if (attr.has_value()) { + final_values.insert(attr.value()); + } + } + } + + // If agents have different values, conflict resolution may still be in progress + // or there was a conflict that resolved differently + if (final_values.size() > 1) { + conflicts_detected++; + } + } + + double conflict_rate = static_cast(conflicts_detected) / + static_cast(NUM_ROUNDS) * 100.0; + + collector.record_consistency("concurrent_update_conflict_rate", + conflict_rate, "%", + {{"num_agents", std::to_string(NUM_AGENTS)}, + {"updates_per_round", std::to_string(UPDATES_PER_AGENT * NUM_AGENTS)}}); + + INFO("Conflict rate: " << conflict_rate << "% (" << conflicts_detected + << "/" << NUM_ROUNDS << " rounds)"); + INFO("Total updates: " << total_updates.load()); + + // Verify final convergence + fixture.wait_for_sync(std::chrono::milliseconds(1000)); + CHECK(fixture.verify_convergence()); + } + + SECTION("Concurrent node creations - potential ID conflicts") { + // This tests CRDT behavior when multiple agents create nodes + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + constexpr int 
NODES_PER_AGENT = 100; + constexpr size_t NUM_AGENTS = 4; + + std::atomic total_created{0}; + std::atomic creation_failures{0}; + + std::barrier sync_point(NUM_AGENTS); + std::vector threads; + threads.reserve(NUM_AGENTS); + + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + sync_point.arrive_and_wait(); + + for (int i = 0; i < NODES_PER_AGENT; ++i) { + // Each agent uses unique IDs in its range + uint64_t node_id = 8500000 + agent_idx * 10000 + i; + auto node = GraphGenerator::create_test_node( + node_id, agent->get_agent_id(), + "agent" + std::to_string(agent_idx) + "_node" + std::to_string(i)); + + auto result = agent->insert_node(node); + if (result.has_value()) { + total_created.fetch_add(1, std::memory_order_relaxed); + } else { + creation_failures.fetch_add(1, std::memory_order_relaxed); + } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for convergence + fixture.wait_for_sync(std::chrono::milliseconds(2000)); + + // Verify all agents have the same nodes + auto* agent_0 = fixture.get_agent(0); + size_t expected_node_count = agent_0->get_nodes().size(); + + bool all_match = true; + for (size_t i = 1; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + if (agent->get_nodes().size() != expected_node_count) { + all_match = false; + } + } + + collector.record_consistency("node_creation_success_rate", + static_cast(total_created.load()) / + static_cast(NODES_PER_AGENT * NUM_AGENTS) * 100.0, "%"); + + collector.record_consistency("final_convergence", + all_match ? 100.0 : 0.0, "%"); + + INFO("Created: " << total_created.load() << "/" << NODES_PER_AGENT * NUM_AGENTS); + INFO("Failures: " << creation_failures.load()); + INFO("All agents converged: " << (all_match ? 
"yes" : "no")); + + CHECK(fixture.verify_convergence()); + } + + SECTION("Edge conflict resolution") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + // Create shared nodes and capture actual IDs + auto node1 = GraphGenerator::create_test_node(0, agent_a->get_agent_id(), "edge_node_1"); + auto node2 = GraphGenerator::create_test_node(0, agent_a->get_agent_id(), "edge_node_2"); + auto result1 = agent_a->insert_node(node1); + auto result2 = agent_a->insert_node(node2); + REQUIRE(result1.has_value()); + REQUIRE(result2.has_value()); + uint64_t node1_id = result1.value(); + uint64_t node2_id = result2.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + uint64_t conflicts = 0; + constexpr int NUM_ROUNDS = 50; + + for (int round = 0; round < NUM_ROUNDS; ++round) { + // Both agents try to create the same edge simultaneously + auto edge_a = GraphGenerator::create_test_edge( + node1_id, node2_id, agent_a->get_agent_id(), "conflict_edge"); + auto edge_b = GraphGenerator::create_test_edge( + node1_id, node2_id, agent_b->get_agent_id(), "conflict_edge"); + + std::thread ta([&]() { agent_a->insert_or_assign_edge(edge_a); }); + std::thread tb([&]() { agent_b->insert_or_assign_edge(edge_b); }); + + ta.join(); + tb.join(); + + fixture.wait_for_sync(std::chrono::milliseconds(200)); + + // Check both agents see the edge + auto edge_on_a = agent_a->get_edge(node1_id, node2_id, "conflict_edge"); + auto edge_on_b = agent_b->get_edge(node1_id, node2_id, "conflict_edge"); + + if (!edge_on_a.has_value() || !edge_on_b.has_value()) { + conflicts++; + } + + // Delete edge for next round + agent_a->delete_edge(node1_id, node2_id, "conflict_edge"); + fixture.wait_for_sync(std::chrono::milliseconds(100)); + } + + double conflict_rate = static_cast(conflicts) / + 
static_cast(NUM_ROUNDS) * 100.0; + + collector.record_consistency("edge_conflict_rate", + conflict_rate, "%"); + + INFO("Edge conflict rate: " << conflict_rate << "%"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "conflict_rate"); +} + +TEST_CASE("CRDT eventual consistency verification", "[CONSISTENCY][eventual][.multi]") { + GraphGenerator generator; + MetricsCollector collector("eventual_consistency"); + + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + SECTION("All agents eventually converge after chaos") { + constexpr size_t NUM_AGENTS = 4; + constexpr int OPS_PER_AGENT = 50; + + std::barrier sync_point(NUM_AGENTS); + std::atomic stop_flag{false}; + + // Each agent performs random operations + std::vector threads; + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + uint64_t base_id = 8700000 + agent_idx * 10000; + + sync_point.arrive_and_wait(); + + for (int i = 0; i < OPS_PER_AGENT && !stop_flag.load(); ++i) { + int op = i % 3; + + if (op == 0) { + // Insert node + auto node = GraphGenerator::create_test_node( + base_id + i, agent->get_agent_id()); + agent->insert_node(node); + } else if (op == 1) { + // Update existing node + auto node = agent->get_node(base_id + (i % (std::max(1, i / 2)))); + if (node) { + agent->add_or_modify_attrib_local( + *node, static_cast(i)); + agent->update_node(*node); + } + } else { + // Insert edge + auto root = agent->get_node_root(); + if (root) { + auto existing = agent->get_node(base_id + (i % (std::max(1, i / 2)))); + if (existing) { + auto edge = GraphGenerator::create_test_edge( + root->id(), existing->id(), agent->get_agent_id()); + agent->insert_or_assign_edge(edge); + } + } + } + + // Small delay between operations + 
std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for eventual consistency + INFO("Waiting for eventual consistency..."); + + auto start = std::chrono::steady_clock::now(); + bool converged = fixture.verify_convergence(std::chrono::seconds(30)); + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_consistency("eventual_consistency_achieved", + converged ? 100.0 : 0.0, "%"); + collector.record_consistency("convergence_duration_after_chaos", + static_cast(duration.count()), "ms"); + + INFO("Convergence " << (converged ? "achieved" : "FAILED") + << " in " << duration.count() << " ms"); + + CHECK(converged); + + if (converged) { + // Verify all agents have same node count + auto* agent_0 = fixture.get_agent(0); + size_t node_count = agent_0->get_nodes().size(); + + for (size_t i = 1; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + CHECK(agent->get_nodes().size() == node_count); + } + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "eventual_consistency"); +} diff --git a/benchmarks/consistency/convergence_time_bench.cpp b/benchmarks/consistency/convergence_time_bench.cpp new file mode 100644 index 0000000..2af178f --- /dev/null +++ b/benchmarks/consistency/convergence_time_bench.cpp @@ -0,0 +1,253 @@ +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Convergence time benchmarks", "[CONSISTENCY][convergence][.multi]") { + GraphGenerator generator; + MetricsCollector collector("convergence_time"); + + SECTION("Single update convergence") { + MultiAgentFixture fixture; + auto 
config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + REQUIRE(agent_a != nullptr); + REQUIRE(agent_b != nullptr); + + LatencyTracker tracker(100); + + for (int i = 0; i < 100; ++i) { + auto node = GraphGenerator::create_test_node( + 0, agent_a->get_agent_id(), + "conv_node_" + std::to_string(i)); + + uint64_t start = get_unix_timestamp(); + auto result = agent_a->insert_node(node); + if (!result.has_value()) continue; + uint64_t node_id = result.value(); + + // Poll until agent B sees the node + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(5)) { + fixture.process_events(1); + auto b_node = agent_b->get_node(node_id); + if (b_node.has_value()) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("single_node_convergence", stats); + collector.record_consistency("convergence_success_rate", + (static_cast(tracker.count()) / 100.0) * 100, "%"); + + INFO("Single node convergence - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + INFO("Success rate: " << tracker.count() << "/100"); + } + + SECTION("Batch convergence time") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + LatencyTracker tracker(20); + + for (int batch = 0; batch < 20; ++batch) { + // Insert batch of 10 nodes and capture actual IDs + std::vector node_ids; + node_ids.reserve(10); + + uint64_t start = get_unix_timestamp(); + + for (int i = 0; i < 10; ++i) { + auto node = GraphGenerator::create_test_node( + 0, 
agent_a->get_agent_id()); + auto result = agent_a->insert_node(node); + if (result.has_value()) { + node_ids.push_back(result.value()); + } + } + + // Wait for all nodes to converge + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(10)) { + fixture.process_events(1); + + bool all_converged = true; + for (auto id : node_ids) { + if (!agent_b->get_node(id).has_value()) { + all_converged = false; + break; + } + } + + if (all_converged) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("batch_convergence_10_nodes", stats); + + INFO("Batch convergence (10 nodes) - Mean: " << stats.mean_ms() << " ms"); + } + + SECTION("Convergence under concurrent updates") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Each agent creates nodes concurrently + for (int round = 0; round < 50; ++round) { + std::vector all_node_ids; + std::mutex ids_mutex; + + uint64_t start = get_unix_timestamp(); + + // Each agent creates 5 nodes in parallel + std::vector threads; + for (size_t agent_idx = 0; agent_idx < 4; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + for (int i = 0; i < 5; ++i) { + auto node = GraphGenerator::create_test_node( + 0, agent->get_agent_id()); + auto result = agent->insert_node(node); + if (result.has_value()) { + std::lock_guard lock(ids_mutex); + all_node_ids.push_back(result.value()); + } + } + }); + } + for (auto& t : threads) t.join(); + + // Wait for all agents to see all nodes + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(15)) { + fixture.process_events(5); + + bool 
all_converged = true; + for (size_t agent_idx = 0; agent_idx < 4 && all_converged; ++agent_idx) { + auto* agent = fixture.get_agent(agent_idx); + for (auto id : all_node_ids) { + if (!agent->get_node(id).has_value()) { + all_converged = false; + break; + } + } + } + + if (all_converged) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("concurrent_convergence_4_agents", stats); + + INFO("Concurrent convergence (4 agents) - Mean: " << stats.mean_ms() << " ms, " + << "P99: " << stats.p99_ms() << " ms"); + + // Check against timeout + CHECK(stats.p99_ms() < 1000); // Should converge within 1 second p99 + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "convergence_time"); +} + +TEST_CASE("Attribute convergence", "[CONSISTENCY][convergence][attributes][.multi]") { + GraphGenerator generator; + MetricsCollector collector("attribute_convergence"); + + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + // Create shared test node and capture actual ID + auto test_node = GraphGenerator::create_test_node( + 0, agent_a->get_agent_id(), "attr_conv_test"); + auto insert_result = agent_a->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t shared_node_id = insert_result.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + SECTION("Attribute update convergence") { + LatencyTracker tracker(100); + + for (int i = 0; i < 100; ++i) { + auto node = agent_a->get_node(shared_node_id); + REQUIRE(node.has_value()); + + int32_t new_value = 1000 + i; + agent_a->add_or_modify_attrib_local(*node, new_value); + + uint64_t start = get_unix_timestamp(); + 
agent_a->update_node(*node); + + // Wait for attribute to converge + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(5)) { + fixture.process_events(1); + + auto b_node = agent_b->get_node(shared_node_id); + if (b_node.has_value()) { + auto attr = agent_b->get_attrib_by_name(*b_node); + if (attr.has_value() && attr.value() == new_value) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("attribute_update_convergence", stats); + + INFO("Attribute convergence - Mean: " << stats.mean_us() << " us"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "attribute_convergence"); +} diff --git a/benchmarks/core/benchmark_config.h b/benchmarks/core/benchmark_config.h new file mode 100644 index 0000000..0734131 --- /dev/null +++ b/benchmarks/core/benchmark_config.h @@ -0,0 +1,55 @@ +#ifndef DSR_BENCHMARK_CONFIG_H +#define DSR_BENCHMARK_CONFIG_H + +#include +#include +#include + +namespace DSR::Benchmark { + +struct BenchmarkConfig { + // Timing configuration + uint32_t warmup_iterations = 10; + uint32_t measurement_iterations = 100; + std::chrono::milliseconds sync_wait_time{200}; + std::chrono::seconds max_convergence_timeout{10}; + + // Multi-agent configuration + uint32_t default_agent_count = 2; + uint32_t max_agent_count = 16; + + // Graph generation + uint32_t small_graph_nodes = 100; + uint32_t medium_graph_nodes = 1000; + uint32_t large_graph_nodes = 10000; + + // Throughput settings + uint32_t throughput_duration_seconds = 5; + uint32_t concurrent_writer_threads = 4; + + // Output settings + std::string results_directory = "results"; + bool export_json = true; + bool export_csv = true; + bool verbose = false; +}; + +// Default configuration singleton +inline BenchmarkConfig& default_config() { + static 
BenchmarkConfig config; + return config; +} + +// Percentile levels for latency statistics +constexpr double PERCENTILE_P50 = 0.50; +constexpr double PERCENTILE_P90 = 0.90; +constexpr double PERCENTILE_P95 = 0.95; +constexpr double PERCENTILE_P99 = 0.99; + +// Threshold constants for validation +constexpr uint64_t MAX_EXPECTED_LATENCY_NS = 100'000'000; // 100ms +constexpr uint64_t MIN_EXPECTED_THROUGHPUT_OPS = 1000; // 1000 ops/sec + +} // namespace DSR::Benchmark + +#endif // DSR_BENCHMARK_CONFIG_H diff --git a/benchmarks/core/metrics_collector.h b/benchmarks/core/metrics_collector.h new file mode 100644 index 0000000..cf08f60 --- /dev/null +++ b/benchmarks/core/metrics_collector.h @@ -0,0 +1,239 @@ +#ifndef DSR_METRICS_COLLECTOR_H +#define DSR_METRICS_COLLECTOR_H + +#include +#include +#include +#include +#include +#include +#include "timing_utils.h" +#include "benchmark_config.h" + +namespace DSR::Benchmark { + +// Categories of benchmark metrics +enum class MetricCategory { + Latency, + Throughput, + Scalability, + Consistency +}; + +inline std::string to_string(MetricCategory cat) { + switch (cat) { + case MetricCategory::Latency: return "latency"; + case MetricCategory::Throughput: return "throughput"; + case MetricCategory::Scalability: return "scalability"; + case MetricCategory::Consistency: return "consistency"; + } + return "unknown"; +} + + +// Individual metric measurement +struct Metric { + std::string name; + MetricCategory category; + std::string unit; + double value; + std::map additional_values; // For percentiles, etc. 
+ std::map tags; // For categorization +}; + + +// Result of a complete benchmark run +struct BenchmarkResult { + std::string benchmark_name; + std::string timestamp; + std::chrono::milliseconds total_duration; + std::vector metrics; + std::map metadata; +}; + + +// Thread-safe collector for benchmark metrics +class MetricsCollector { +public: + MetricsCollector() = default; + + explicit MetricsCollector(std::string benchmark_name) + : benchmark_name_(std::move(benchmark_name)) + , start_time_(std::chrono::steady_clock::now()) + {} + + // Set benchmark name + void set_benchmark_name(const std::string& name) { + std::lock_guard lock(mutex_); + benchmark_name_ = name; + } + + // Add metadata + void add_metadata(const std::string& key, const std::string& value) { + std::lock_guard lock(mutex_); + metadata_[key] = value; + } + + // Record a simple metric + void record(const std::string& name, MetricCategory category, + double value, const std::string& unit = "") { + Metric m; + m.name = name; + m.category = category; + m.value = value; + m.unit = unit; + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record a metric with tags + void record(const std::string& name, MetricCategory category, + double value, const std::string& unit, + const std::map& tags) { + Metric m; + m.name = name; + m.category = category; + m.value = value; + m.unit = unit; + m.tags = tags; + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record latency statistics from a LatencyTracker + void record_latency_stats(const std::string& name, LatencyStats stats, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Latency; + m.value = stats.mean_ns; + m.unit = "ns"; + m.tags = tags; + m.additional_values["count"] = static_cast(stats.count); + m.additional_values["mean_ns"] = stats.mean_ns; + m.additional_values["stddev_ns"] = stats.stddev_ns; + m.additional_values["min_ns"] = static_cast(stats.min_ns); + 
m.additional_values["max_ns"] = static_cast(stats.max_ns); + m.additional_values["p50_ns"] = static_cast(stats.p50_ns); + m.additional_values["p90_ns"] = static_cast(stats.p90_ns); + m.additional_values["p95_ns"] = static_cast(stats.p95_ns); + m.additional_values["p99_ns"] = static_cast(stats.p99_ns); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record throughput + void record_throughput(const std::string& name, uint64_t operations, + std::chrono::milliseconds duration, + const std::map& tags = {}) { + double ops_per_sec = static_cast(operations) / + (static_cast(duration.count()) / 1000.0); + + Metric m; + m.name = name; + m.category = MetricCategory::Throughput; + m.value = ops_per_sec; + m.unit = "ops/sec"; + m.tags = tags; + m.additional_values["total_operations"] = static_cast(operations); + m.additional_values["duration_ms"] = static_cast(duration.count()); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record scalability metric + void record_scalability(const std::string& name, uint32_t scale_factor, + double metric_value, const std::string& unit, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Scalability; + m.value = metric_value; + m.unit = unit; + m.tags = tags; + m.additional_values["scale_factor"] = static_cast(scale_factor); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record consistency metric + void record_consistency(const std::string& name, double value, + const std::string& unit, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Consistency; + m.value = value; + m.unit = unit; + m.tags = tags; + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Get all metrics by category + [[nodiscard]] std::vector get_metrics(MetricCategory category) const { + std::lock_guard lock(mutex_); + std::vector result; + for (const auto& m : metrics_) { 
+ if (m.category == category) { + result.push_back(m); + } + } + return result; + } + + // Get all metrics + [[nodiscard]] std::vector get_all_metrics() const { + std::lock_guard lock(mutex_); + return metrics_; + } + + // Generate final result + [[nodiscard]] BenchmarkResult finalize() { + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time_); + + // Generate timestamp + auto now = std::chrono::system_clock::now(); + auto time_t_now = std::chrono::system_clock::to_time_t(now); + char timestamp_buf[64]; + std::strftime(timestamp_buf, sizeof(timestamp_buf), "%Y-%m-%dT%H:%M:%S", + std::localtime(&time_t_now)); + + std::lock_guard lock(mutex_); + BenchmarkResult result; + result.benchmark_name = benchmark_name_; + result.timestamp = timestamp_buf; + result.total_duration = duration; + result.metrics = metrics_; + result.metadata = metadata_; + + return result; + } + + // Clear all collected metrics + void clear() { + std::lock_guard lock(mutex_); + metrics_.clear(); + metadata_.clear(); + start_time_ = std::chrono::steady_clock::now(); + } + +private: + mutable std::mutex mutex_; + std::string benchmark_name_; + std::chrono::steady_clock::time_point start_time_; + std::vector metrics_; + std::map metadata_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_METRICS_COLLECTOR_H diff --git a/benchmarks/core/report_generator.h b/benchmarks/core/report_generator.h new file mode 100644 index 0000000..6831f2a --- /dev/null +++ b/benchmarks/core/report_generator.h @@ -0,0 +1,255 @@ +#ifndef DSR_REPORT_GENERATOR_H +#define DSR_REPORT_GENERATOR_H + +#include +#include +#include +#include +#include +#include "metrics_collector.h" + +namespace DSR::Benchmark { + +class ReportGenerator { +public: + explicit ReportGenerator(std::string output_directory = "results") + : output_directory_(std::move(output_directory)) + {} + + // Export benchmark result to JSON + bool export_json(const BenchmarkResult& result, 
const std::string& filename = "") { + std::string filepath = generate_filepath(result, filename, ".json"); + std::ofstream out(filepath); + if (!out.is_open()) { + return false; + } + + out << "{\n"; + out << " \"benchmark_name\": " << quote(result.benchmark_name) << ",\n"; + out << " \"timestamp\": " << quote(result.timestamp) << ",\n"; + out << " \"total_duration_ms\": " << result.total_duration.count() << ",\n"; + + // Metadata + out << " \"metadata\": {\n"; + bool first = true; + for (const auto& [key, value] : result.metadata) { + if (!first) out << ",\n"; + out << " " << quote(key) << ": " << quote(value); + first = false; + } + out << "\n },\n"; + + // Metrics + out << " \"metrics\": [\n"; + for (size_t i = 0; i < result.metrics.size(); ++i) { + const auto& m = result.metrics[i]; + out << " {\n"; + out << " \"name\": " << quote(m.name) << ",\n"; + out << " \"category\": " << quote(to_string(m.category)) << ",\n"; + out << " \"value\": " << format_double(m.value) << ",\n"; + out << " \"unit\": " << quote(m.unit); + + if (!m.additional_values.empty()) { + out << ",\n \"additional\": {\n"; + bool first_add = true; + for (const auto& [key, value] : m.additional_values) { + if (!first_add) out << ",\n"; + out << " " << quote(key) << ": " << format_double(value); + first_add = false; + } + out << "\n }"; + } + + if (!m.tags.empty()) { + out << ",\n \"tags\": {\n"; + bool first_tag = true; + for (const auto& [key, value] : m.tags) { + if (!first_tag) out << ",\n"; + out << " " << quote(key) << ": " << quote(value); + first_tag = false; + } + out << "\n }"; + } + + out << "\n }"; + if (i < result.metrics.size() - 1) out << ","; + out << "\n"; + } + out << " ]\n"; + out << "}\n"; + + out.close(); + last_json_path_ = filepath; + return true; + } + + // Export benchmark result to CSV + bool export_csv(const BenchmarkResult& result, const std::string& filename = "") { + std::string filepath = generate_filepath(result, filename, ".csv"); + std::ofstream out(filepath); + 
if (!out.is_open()) { + return false; + } + + // Header + out << "benchmark_name,timestamp,metric_name,category,value,unit," + << "mean_ns,stddev_ns,min_ns,max_ns,p50_ns,p90_ns,p95_ns,p99_ns,count\n"; + + // Data rows + for (const auto& m : result.metrics) { + out << quote_csv(result.benchmark_name) << "," + << quote_csv(result.timestamp) << "," + << quote_csv(m.name) << "," + << quote_csv(to_string(m.category)) << "," + << format_double(m.value) << "," + << quote_csv(m.unit) << ","; + + // Additional values (latency-specific) + auto get_add = [&m](const std::string& key) -> std::string { + auto it = m.additional_values.find(key); + if (it != m.additional_values.end()) { + return format_double(it->second); + } + return ""; + }; + + out << get_add("mean_ns") << "," + << get_add("stddev_ns") << "," + << get_add("min_ns") << "," + << get_add("max_ns") << "," + << get_add("p50_ns") << "," + << get_add("p90_ns") << "," + << get_add("p95_ns") << "," + << get_add("p99_ns") << "," + << get_add("count") << "\n"; + } + + out.close(); + last_csv_path_ = filepath; + return true; + } + + // Export both JSON and CSV + bool export_all(const BenchmarkResult& result, const std::string& base_filename = "") { + bool json_ok = export_json(result, base_filename); + bool csv_ok = export_csv(result, base_filename); + return json_ok && csv_ok; + } + + // Compare with baseline and generate comparison report + bool compare_with_baseline(const BenchmarkResult& current, + const std::string& baseline_json_path, + double regression_threshold_percent = 10.0) { + // Read baseline JSON (simplified parsing) + std::ifstream baseline_file(baseline_json_path); + if (!baseline_file.is_open()) { + return false; + } + + // For now, just note that comparison is requested + // Full JSON parsing would require nlohmann/json + comparison_requested_ = true; + baseline_path_ = baseline_json_path; + regression_threshold_ = regression_threshold_percent; + + return true; + } + + // Get last generated file paths + 
[[nodiscard]] const std::string& last_json_path() const { return last_json_path_; } + [[nodiscard]] const std::string& last_csv_path() const { return last_csv_path_; } + + // Set output directory + void set_output_directory(const std::string& dir) { + output_directory_ = dir; + } + +private: + std::string generate_filepath(const BenchmarkResult& result, + const std::string& filename, + const std::string& extension) { + // Ensure directory exists + std::filesystem::create_directories(output_directory_); + + std::string name = filename; + if (name.empty()) { + // Generate filename from benchmark name and timestamp + name = "benchmark_" + sanitize_filename(result.benchmark_name) + + "_" + sanitize_filename(result.timestamp); + } + + // Remove extension if present + if (name.size() > extension.size() && + name.substr(name.size() - extension.size()) == extension) { + name = name.substr(0, name.size() - extension.size()); + } + + return output_directory_ + "/" + name + extension; + } + + static std::string sanitize_filename(const std::string& name) { + std::string result; + for (char c : name) { + if (std::isalnum(c) || c == '_' || c == '-') { + result += c; + } else if (c == ' ' || c == ':' || c == '/') { + result += '_'; + } + } + return result; + } + + static std::string quote(const std::string& s) { + std::string result = "\""; + for (char c : s) { + if (c == '"') result += "\\\""; + else if (c == '\\') result += "\\\\"; + else if (c == '\n') result += "\\n"; + else result += c; + } + result += "\""; + return result; + } + + static std::string quote_csv(const std::string& s) { + if (s.find(',') != std::string::npos || + s.find('"') != std::string::npos || + s.find('\n') != std::string::npos) { + std::string escaped; + for (char c : s) { + if (c == '"') escaped += "\"\""; + else escaped += c; + } + return "\"" + escaped + "\""; + } + return s; + } + + static std::string format_double(double value) { + std::ostringstream oss; + oss << std::setprecision(6) << std::fixed 
<< value; + std::string str = oss.str(); + // Remove trailing zeros + size_t dot_pos = str.find('.'); + if (dot_pos != std::string::npos) { + size_t last_non_zero = str.find_last_not_of('0'); + if (last_non_zero > dot_pos) { + str = str.substr(0, last_non_zero + 1); + } else { + str = str.substr(0, dot_pos); + } + } + return str; + } + + std::string output_directory_; + std::string last_json_path_; + std::string last_csv_path_; + bool comparison_requested_ = false; + std::string baseline_path_; + double regression_threshold_ = 10.0; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_REPORT_GENERATOR_H diff --git a/benchmarks/core/timing_utils.h b/benchmarks/core/timing_utils.h new file mode 100644 index 0000000..30122c1 --- /dev/null +++ b/benchmarks/core/timing_utils.h @@ -0,0 +1,245 @@ +#ifndef DSR_TIMING_UTILS_H +#define DSR_TIMING_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DSR::Benchmark { + +// Monotonic nanosecond counter for benchmark measurements. +// Uses steady_clock (CLOCK_MONOTONIC on Linux) instead of system_clock so +// that NTP adjustments and settimeofday() cannot produce negative intervals +// or artificially inflate latency samples. 
+inline uint64_t bench_now() noexcept { + return static_cast( + std::chrono::steady_clock::now().time_since_epoch().count()); +} + +// RAII timer that calls a callback with elapsed nanoseconds on destruction +class ScopedTimer { +public: + using Callback = std::function; + + explicit ScopedTimer(Callback on_complete) + : callback_(std::move(on_complete)) + , start_time_(bench_now()) + {} + + ~ScopedTimer() { + if (callback_) { + uint64_t elapsed = bench_now() - start_time_; + callback_(elapsed); + } + } + + // Disable copy + ScopedTimer(const ScopedTimer&) = delete; + ScopedTimer& operator=(const ScopedTimer&) = delete; + + // Allow move + ScopedTimer(ScopedTimer&& other) noexcept + : callback_(std::move(other.callback_)) + , start_time_(other.start_time_) + { + other.callback_ = nullptr; + } + + ScopedTimer& operator=(ScopedTimer&& other) noexcept { + if (this != &other) { + callback_ = std::move(other.callback_); + start_time_ = other.start_time_; + other.callback_ = nullptr; + } + return *this; + } + + // Get elapsed time without stopping + [[nodiscard]] uint64_t elapsed_ns() const { + return bench_now() - start_time_; + } + + // Cancel the callback + void cancel() { + callback_ = nullptr; + } + +private: + Callback callback_; + uint64_t start_time_; +}; + + +// Statistics from latency measurements +struct LatencyStats { + uint64_t count = 0; + double mean_ns = 0.0; + double stddev_ns = 0.0; + uint64_t min_ns = 0; + uint64_t max_ns = 0; + uint64_t p50_ns = 0; + uint64_t p90_ns = 0; + uint64_t p95_ns = 0; + uint64_t p99_ns = 0; + + // Convenience methods for different units + [[nodiscard]] double mean_us() const { return mean_ns / 1000.0; } + [[nodiscard]] double mean_ms() const { return mean_ns / 1'000'000.0; } + [[nodiscard]] double stddev_us() const { return stddev_ns / 1000.0; } + [[nodiscard]] double stddev_ms() const { return stddev_ns / 1'000'000.0; } + [[nodiscard]] double min_us() const { return min_ns / 1000.0; } + [[nodiscard]] double max_us() const { 
return max_ns / 1000.0; } + [[nodiscard]] double p50_us() const { return p50_ns / 1000.0; } + [[nodiscard]] double p90_us() const { return p90_ns / 1000.0; } + [[nodiscard]] double p95_us() const { return p95_ns / 1000.0; } + [[nodiscard]] double p99_us() const { return p99_ns / 1000.0; } + [[nodiscard]] double min_ms() const { return min_ns / 1'000'000.0; } + [[nodiscard]] double max_ms() const { return max_ns / 1'000'000.0; } + [[nodiscard]] double p50_ms() const { return p50_ns / 1'000'000.0; } + [[nodiscard]] double p90_ms() const { return p90_ns / 1'000'000.0; } + [[nodiscard]] double p95_ms() const { return p95_ns / 1'000'000.0; } + [[nodiscard]] double p99_ms() const { return p99_ns / 1'000'000.0; } +}; + + +// Collects latency samples and computes statistics +class LatencyTracker { +public: + LatencyTracker() = default; + + // Reserve space for expected samples + explicit LatencyTracker(size_t expected_samples) { + samples_.reserve(expected_samples); + } + + // Record a latency sample in nanoseconds + void record(uint64_t latency_ns) { + samples_.push_back(latency_ns); + stats_valid_ = false; + } + + // Record using ScopedTimer callback pattern + [[nodiscard]] auto recorder() { + return [this](uint64_t latency_ns) { + this->record(latency_ns); + }; + } + + // Create a ScopedTimer that records to this tracker + [[nodiscard]] ScopedTimer scoped_record() { + return ScopedTimer(recorder()); + } + + // Get number of recorded samples + [[nodiscard]] size_t count() const { + return samples_.size(); + } + + // Check if tracker has samples + [[nodiscard]] bool empty() const { + return samples_.empty(); + } + + // Clear all samples + void clear() { + samples_.clear(); + stats_valid_ = false; + } + + // Get raw samples (for export) + [[nodiscard]] const std::vector& samples() const { + return samples_; + } + + // Compute and return statistics + [[nodiscard]] LatencyStats stats() { + if (stats_valid_) { + return cached_stats_; + } + + if (samples_.empty()) { + return 
LatencyStats{}; + } + + // Sort samples for percentile calculation + std::vector sorted = samples_; + std::sort(sorted.begin(), sorted.end()); + + LatencyStats result; + result.count = sorted.size(); + result.min_ns = sorted.front(); + result.max_ns = sorted.back(); + + // Calculate mean + double sum = std::accumulate(sorted.begin(), sorted.end(), 0.0); + result.mean_ns = sum / static_cast(result.count); + + // Calculate standard deviation + double sq_sum = std::accumulate(sorted.begin(), sorted.end(), 0.0, + [mean = result.mean_ns](double acc, uint64_t val) { + double diff = static_cast(val) - mean; + return acc + diff * diff; + }); + result.stddev_ns = std::sqrt(sq_sum / static_cast(result.count)); + + // Calculate percentiles + result.p50_ns = percentile(sorted, 0.50); + result.p90_ns = percentile(sorted, 0.90); + result.p95_ns = percentile(sorted, 0.95); + result.p99_ns = percentile(sorted, 0.99); + + cached_stats_ = result; + stats_valid_ = true; + return result; + } + +private: + static uint64_t percentile(const std::vector& sorted, double p) { + if (sorted.empty()) return 0; + if (sorted.size() == 1) return sorted[0]; + + double index = p * static_cast(sorted.size() - 1); + size_t lower = static_cast(std::floor(index)); + size_t upper = static_cast(std::ceil(index)); + + if (lower == upper) { + return sorted[lower]; + } + + double fraction = index - static_cast(lower); + return static_cast( + static_cast(sorted[lower]) * (1.0 - fraction) + + static_cast(sorted[upper]) * fraction + ); + } + + std::vector samples_; + LatencyStats cached_stats_; + bool stats_valid_ = false; +}; + + +// Utility function to measure a single operation +template +uint64_t measure_ns(Func&& func) { + uint64_t start = bench_now(); + std::forward(func)(); + return bench_now() - start; +} + +// Utility function to run warmup iterations +template +void warmup(Func&& func, uint32_t iterations) { + for (uint32_t i = 0; i < iterations; ++i) { + std::forward(func)(); + } +} + +} // 
namespace DSR::Benchmark + +#endif // DSR_TIMING_UTILS_H diff --git a/benchmarks/fixtures/graph_generator.h b/benchmarks/fixtures/graph_generator.h new file mode 100644 index 0000000..a6c3191 --- /dev/null +++ b/benchmarks/fixtures/graph_generator.h @@ -0,0 +1,354 @@ +#ifndef DSR_GRAPH_GENERATOR_H +#define DSR_GRAPH_GENERATOR_H + +#include +#include +#include +#include +#include +#include +#include + +namespace DSR::Benchmark { + +// Graph topology types +enum class GraphTopology { + Linear, // Chain of nodes + Star, // Hub with spokes + Tree, // Hierarchical tree + FullMesh, // Every node connected to every other + Random // Random connections +}; + + +// Configuration for synthetic graph generation +struct GraphGeneratorConfig { + uint32_t num_nodes = 100; + uint32_t edges_per_node = 2; + GraphTopology topology = GraphTopology::Tree; + std::string node_type = "test_node"; + std::string edge_type = "test_edge"; + bool include_rt_edges = false; + bool include_attributes = true; + uint32_t attributes_per_node = 3; +}; + + +class GraphGenerator { +public: + explicit GraphGenerator(unsigned int seed = std::random_device{}()) + : rng_(seed) + { + // Ensure test types are registered (safe to call multiple times) + register_test_types(); + } + + // Register test node/edge types - call this before using any DSR operations + static void register_test_types() { + static bool registered = false; + if (!registered) { + node_types::register_type("test_node"); + edge_types::register_type("test_edge"); + registered = true; + } + } + + // Generate a config file with synthetic graph + std::string generate_config_file(const GraphGeneratorConfig& config) { + std::string filename = temp_filename(); + std::ofstream out(filename); + if (!out.is_open()) { + return ""; + } + + out << "{\n"; + out << " \"DSRModel\": {\n"; + out << " \"symbols\": {\n"; + + // Generate root node + out << generate_root_node(); + + // Generate additional nodes based on topology + auto node_ids = 
generate_node_ids(config.num_nodes); + + for (size_t i = 0; i < node_ids.size(); ++i) { + out << ",\n"; + out << generate_node(node_ids[i], config, i); + } + + out << "\n }\n"; + out << " }\n"; + out << "}\n"; + + out.close(); + return filename; + } + + // Generate small graph (100 nodes) + std::string generate_small_graph() { + GraphGeneratorConfig config; + config.num_nodes = 100; + config.topology = GraphTopology::Tree; + return generate_config_file(config); + } + + // Generate medium graph (1000 nodes) + std::string generate_medium_graph() { + GraphGeneratorConfig config; + config.num_nodes = 1000; + config.topology = GraphTopology::Tree; + return generate_config_file(config); + } + + // Generate large graph (10000 nodes) + std::string generate_large_graph() { + GraphGeneratorConfig config; + config.num_nodes = 10000; + config.topology = GraphTopology::Tree; + config.include_attributes = false; // Reduce size + return generate_config_file(config); + } + + // Generate empty config (just root) + std::string generate_empty_graph() { + std::string filename = temp_filename(); + std::ofstream out(filename); + if (!out.is_open()) { + return ""; + } + + out << "{\n"; + out << " \"DSRModel\": {\n"; + out << " \"symbols\": {\n"; + out << generate_root_node(); + out << "\n }\n"; + out << " }\n"; + out << "}\n"; + + out.close(); + return filename; + } + + // Add nodes directly to an existing graph + void populate_graph(DSRGraph& graph, uint32_t num_nodes, + const std::string& node_type = "test_node") { + uint64_t base_id = 1000; + auto root = graph.get_node_root(); + uint64_t parent_id = root ? 
root->id() : 100; + + for (uint32_t i = 0; i < num_nodes; ++i) { + DSR::Node node; + node.id(base_id + i); + node.name("bench_node_" + std::to_string(i)); + node.type(node_type); + node.agent_id(graph.get_agent_id()); + + // Add some attributes + graph.add_attrib_local(node, static_cast(i % 10)); + + graph.insert_node(node); + + // Add edge from parent + if (i > 0 && (i % 10) == 0) { + parent_id = base_id + i - 1; + } + + DSR::Edge edge; + edge.from(parent_id); + edge.to(node.id()); + edge.type("test_edge"); + edge.agent_id(graph.get_agent_id()); + graph.insert_or_assign_edge(edge); + } + } + + // Create a node for insertion benchmarks + static DSR::Node create_test_node(uint64_t id, uint32_t agent_id, + const std::string& name = "") { + DSR::Node node; + node.id(id); + node.name(name.empty() ? "test_node_" + std::to_string(id) : name); + node.type("test_node"); + node.agent_id(agent_id); + return node; + } + + // Create an edge for insertion benchmarks + static DSR::Edge create_test_edge(uint64_t from, uint64_t to, + uint32_t agent_id, + const std::string& type = "test_edge") { + DSR::Edge edge; + edge.from(from); + edge.to(to); + edge.type(type); + edge.agent_id(agent_id); + return edge; + } + +private: + std::string temp_filename() { + std::string base = "/tmp/dsr_bench_"; + std::uniform_int_distribution dist; + return base + std::to_string(dist(rng_)) + ".json"; + } + + std::vector generate_node_ids(uint32_t count) { + std::vector ids; + ids.reserve(count); + for (uint32_t i = 0; i < count; ++i) { + ids.push_back(1000 + i); // Start from 1000 to avoid conflicts + } + return ids; + } + + std::string generate_root_node() { + return R"( "100": { + "attribute": { + "level": { + "type": 1, + "value": 0 + } + }, + "id": "100", + "links": [], + "name": "root", + "type": "root" + })"; + } + + std::string generate_node(uint64_t id, const GraphGeneratorConfig& config, + size_t index) { + std::ostringstream oss; + oss << " \"" << id << "\": {\n"; + + // Attributes + oss 
<< " \"attribute\": {\n"; + oss << " \"level\": {\n"; + oss << " \"type\": 1,\n"; + oss << " \"value\": " << (index % 10 + 1) << "\n"; + oss << " }"; + + if (config.include_attributes) { + for (uint32_t a = 0; a < config.attributes_per_node; ++a) { + oss << ",\n \"attr_" << a << "\": {\n"; + oss << " \"type\": 1,\n"; + oss << " \"value\": " << (rng_() % 1000) << "\n"; + oss << " }"; + } + } + + oss << "\n },\n"; + + // ID and name + oss << " \"id\": \"" << id << "\",\n"; + + // Links (edges) + oss << " \"links\": ["; + auto links = generate_links(id, config, index); + for (size_t i = 0; i < links.size(); ++i) { + if (i > 0) oss << ", "; + oss << "\n" << links[i]; + } + if (!links.empty()) oss << "\n "; + oss << "],\n"; + + // Name and type + oss << " \"name\": \"node_" << id << "\",\n"; + oss << " \"type\": \"" << config.node_type << "\"\n"; + oss << " }"; + + return oss.str(); + } + + std::vector generate_links(uint64_t from_id, + const GraphGeneratorConfig& config, + size_t index) { + std::vector links; + + // Always link back to root for tree topology + if (config.topology == GraphTopology::Tree && index == 0) { + links.push_back(generate_link(from_id, 100, config.edge_type, config.include_rt_edges)); + } + + // Generate additional links based on topology + switch (config.topology) { + case GraphTopology::Linear: + if (index > 0) { + links.push_back(generate_link(from_id, 1000 + index - 1, + config.edge_type, config.include_rt_edges)); + } else { + links.push_back(generate_link(from_id, 100, + config.edge_type, config.include_rt_edges)); + } + break; + + case GraphTopology::Star: + links.push_back(generate_link(from_id, 100, + config.edge_type, config.include_rt_edges)); + break; + + case GraphTopology::Tree: { + // Each node links to its parent in tree + uint64_t parent_id = (index == 0) ? 
100 : (1000 + (index - 1) / 2); + links.push_back(generate_link(from_id, parent_id, + config.edge_type, config.include_rt_edges)); + break; + } + + case GraphTopology::FullMesh: + // Limited to avoid explosion + for (uint64_t target = 1000; target < from_id && links.size() < 5; ++target) { + links.push_back(generate_link(from_id, target, + config.edge_type, config.include_rt_edges)); + } + break; + + case GraphTopology::Random: { + std::uniform_int_distribution count_dist(1, config.edges_per_node); + std::uniform_int_distribution id_dist(100, 1000 + index - 1); + uint32_t num_links = (index == 0) ? 1 : count_dist(rng_); + for (uint32_t i = 0; i < num_links; ++i) { + uint64_t target = (index == 0) ? 100 : id_dist(rng_); + links.push_back(generate_link(from_id, target, + config.edge_type, config.include_rt_edges)); + } + break; + } + } + + return links; + } + + std::string generate_link(uint64_t from, uint64_t to, + const std::string& type, bool include_rt) { + std::ostringstream oss; + oss << " {\n"; + oss << " \"dst\": \"" << to << "\",\n"; + oss << " \"label\": \"" << type << "\",\n"; + oss << " \"linkAttribute\": {"; + + if (include_rt && type == "RT") { + oss << R"( + "rt_rotation_euler_xyz": { + "type": 3, + "value": [0, 0, 0] + }, + "rt_translation": { + "type": 3, + "value": [0, 0, 0] + })"; + } + + oss << "},\n"; + oss << " \"src\": \"" << from << "\"\n"; + oss << " }"; + + return oss.str(); + } + + std::mt19937 rng_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_GRAPH_GENERATOR_H diff --git a/benchmarks/fixtures/multi_agent_fixture.h b/benchmarks/fixtures/multi_agent_fixture.h new file mode 100644 index 0000000..4988227 --- /dev/null +++ b/benchmarks/fixtures/multi_agent_fixture.h @@ -0,0 +1,265 @@ +#ifndef DSR_MULTI_AGENT_FIXTURE_H +#define DSR_MULTI_AGENT_FIXTURE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../core/benchmark_config.h" +#include 
"../core/timing_utils.h" + +namespace DSR::Benchmark { + +// Agent info for tracking +struct AgentInfo { + uint32_t id; + std::string name; + std::unique_ptr graph; + std::atomic participants_matched{0}; +}; + + +// Forward declaration for type registration +class GraphGenerator; + +// Reusable multi-agent test fixture +class MultiAgentFixture { +public: + explicit MultiAgentFixture(const BenchmarkConfig& config = default_config()) + : config_(config) + { + // Ensure test types are registered before any DSR operations + register_benchmark_types(); + } + + // Register node/edge types needed by benchmarks + static void register_benchmark_types() { + static bool registered = false; + if (!registered) { + node_types::register_type("test_node"); + edge_types::register_type("test_edge"); + registered = true; + } + } + + ~MultiAgentFixture() { + cleanup(); + } + + // Disable copy + MultiAgentFixture(const MultiAgentFixture&) = delete; + MultiAgentFixture& operator=(const MultiAgentFixture&) = delete; + + // Create N agent instances with DSRGraph + // First agent loads from config_file, others sync via DDS + bool create_agents(uint32_t num_agents, const std::string& config_file) { + if (num_agents == 0 || num_agents > config_.max_agent_count) { + qWarning("Can't create agents"); + return false; + } + + // Generate unique base ID for this test run + base_agent_id_ = static_cast(rand() % 3095); + + agents_.clear(); + agents_.reserve(num_agents); + + // Create first agent with config file (it defines the initial graph) + { + auto agent = std::make_unique(); + agent->id = base_agent_id_; + agent->name = "bench_agent_0"; + + try { + agent->graph = std::make_unique( + agent->name, + agent->id, + config_file + ); + agents_.push_back(std::move(agent)); + } catch (const std::exception& e) { + qWarning("Failed to create primary agent: %s", e.what()); + return false; + } + } + + // Small delay for DDS to initialize primary agent + process_events(50); + + // Create additional agents 
WITHOUT config file - they sync via DDS + for (uint32_t i = 1; i < num_agents; ++i) { + auto agent = std::make_unique(); + agent->id = base_agent_id_ + i; + agent->name = "bench_agent_" + std::to_string(i); + + try { + // No config file - agent receives graph from DDS + agent->graph = std::make_unique( + agent->name, + agent->id + ); + agents_.push_back(std::move(agent)); + } catch (const std::exception& e) { + qWarning("Failed to create agent %u: %s", i, e.what()); + return false; + } + + // Process events after each agent creation + process_events(20); + } + + return true; + } + + // Wait for DDS synchronization between agents + // Actively processes events while waiting + void wait_for_sync(std::chrono::milliseconds wait_time = std::chrono::milliseconds{0}) { + if (wait_time.count() == 0) { + wait_time = config_.sync_wait_time; + } + + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < wait_time) { + process_events(10); + } + } + + // Verify all agents have converged to same state + bool verify_convergence(std::chrono::seconds timeout = std::chrono::seconds{0}) { + if (timeout.count() == 0) { + timeout = config_.max_convergence_timeout; + } + + if (agents_.size() < 2) { + return true; // Single agent is always converged + } + + auto start = std::chrono::steady_clock::now(); + + while (std::chrono::steady_clock::now() - start < timeout) { + if (check_node_convergence()) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + process_events(); + } + + return false; + } + + // Measure time to convergence + std::chrono::milliseconds measure_convergence_time() { + auto start = std::chrono::steady_clock::now(); + + while (!check_node_convergence()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + process_events(); + + auto elapsed = std::chrono::steady_clock::now() - start; + if (elapsed > config_.max_convergence_timeout) { + return std::chrono::milliseconds{-1}; // Timeout + 
} + } + + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + } + + // Get agent by index + DSRGraph* get_agent(size_t index) { + if (index < agents_.size()) { + return agents_[index]->graph.get(); + } + return nullptr; + } + + // Get agent info by index + AgentInfo* get_agent_info(size_t index) { + if (index < agents_.size()) { + return agents_[index].get(); + } + return nullptr; + } + + // Get number of agents + [[nodiscard]] size_t agent_count() const { + return agents_.size(); + } + + // Connect signal handler to all agents + template + void connect_all(Signal signal, Slot slot) { + for (auto& agent : agents_) { + QObject::connect(agent->graph.get(), signal, slot, Qt::QueuedConnection); + } + } + + // Process Qt events (for signal delivery) + void process_events(int timeout_ms = 10) { + auto* app = QCoreApplication::instance(); + if (app) { + app->processEvents(QEventLoop::AllEvents, timeout_ms); + } + } + + // Run event loop for specified duration + void run_event_loop(std::chrono::milliseconds duration) { + auto* app = QCoreApplication::instance(); + if (!app) return; + + QEventLoop loop; + QTimer::singleShot(duration.count(), &loop, &QEventLoop::quit); + loop.exec(); + } + + // Cleanup all agents + void cleanup() { + agents_.clear(); + } + + // Get number of agents + [[nodiscard]] size_t size() const { + return agents_.size(); + } + +private: + bool check_node_convergence() { + if (agents_.size() < 2) return true; + + auto& first_graph = agents_[0]->graph; + auto first_nodes = first_graph->get_nodes(); + + for (size_t i = 1; i < agents_.size(); ++i) { + auto nodes = agents_[i]->graph->get_nodes(); + if (nodes.size() != first_nodes.size()) { + return false; + } + + // Check each node exists in the other graph + for (const auto& node : first_nodes) { + auto other_node = agents_[i]->graph->get_node(node.id()); + if (!other_node.has_value()) { + return false; + } + } + } + + return true; + } + + BenchmarkConfig config_; + uint32_t 
base_agent_id_ = 0; + std::vector> agents_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_MULTI_AGENT_FIXTURE_H diff --git a/benchmarks/latency/crdt_join_bench.cpp b/benchmarks/latency/crdt_join_bench.cpp new file mode 100644 index 0000000..db4ff0e --- /dev/null +++ b/benchmarks/latency/crdt_join_bench.cpp @@ -0,0 +1,229 @@ +#include +#include + +#include +#include +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" + +using namespace DSR::Benchmark; + +// Create a test attribute +static DSR::CRDTAttribute make_test_attribute(uint32_t agent_id, int32_t value) { + DSR::CRDTAttribute attr; + attr.value(value); + attr.timestamp(bench_now()); + attr.agent_id(agent_id); + return attr; +} + +// All four mvreg operations in a single TEST_CASE so they export together +// to one JSON file. No Catch2 SECTIONs — each measurement block runs +// sequentially so all metrics accumulate in one collector. +TEST_CASE("CRDT mvreg operations", "[CRDT][mvreg]") { + MetricsCollector collector("crdt_mvreg"); + + // ── mvreg write ─────────────────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + mvreg reg; + reg.id = 100; + + for (int i = 0; i < 1000; ++i) { + auto attr = make_test_attribute(100, i); + uint64_t start = bench_now(); + auto delta = reg.write(attr); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("mvreg_write", tracker.stats()); + INFO("mvreg::write mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── mvreg join (same agent) ─────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + mvreg reg; + reg.id = 100; + + auto init_attr = make_test_attribute(100, 0); + reg.write(init_attr); + + for (int i = 0; i < 1000; ++i) { + mvreg delta_reg; + delta_reg.id = 100; + auto new_attr = make_test_attribute(100, i); + auto delta = delta_reg.write(new_attr); + + uint64_t start = bench_now(); + reg.join(std::move(delta)); + 
tracker.record(bench_now() - start); + } + collector.record_latency_stats("mvreg_join_same_agent", tracker.stats()); + INFO("mvreg::join (same agent) mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── mvreg join (different agents) ──────────────────────────────────────── + { + LatencyTracker tracker(1000); + + for (int i = 0; i < 1000; ++i) { + mvreg reg; + reg.id = 100; + + auto attr = make_test_attribute(100, 0); + auto delta = reg.write(attr); + + uint32_t other_agent = 200 + (i % 10); + mvreg delta_reg; + delta_reg.id = other_agent; + delta_reg.join(std::move(delta)); + auto new_attr = make_test_attribute(other_agent, i * 2); + delta = delta_reg.write(new_attr); + + uint64_t start = bench_now(); + reg.join(std::move(delta)); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("mvreg_join_different_agent", tracker.stats()); + INFO("mvreg::join (different agent) mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── mvreg read ──────────────────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + mvreg reg; + reg.id = 100; + + auto attr = make_test_attribute(100, 42); + reg.write(attr); + + for (int i = 0; i < 1000; ++i) { + uint64_t start = bench_now(); + [[maybe_unused]] const auto& value = reg.read_reg(); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("mvreg_read", tracker.stats()); + INFO("mvreg::read mean: " << tracker.stats().mean_ns << " ns"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "crdt_mvreg"); +} + +TEST_CASE("CRDT dot_context operations", "[CRDT][dot_context]") { + MetricsCollector collector("crdt_dot_context"); + + // ── makedot ─────────────────────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + dot_context ctx; + + for (int i = 0; i < 1000; ++i) { + uint64_t start = bench_now(); + auto dot = ctx.makedot(100 + (i % 10)); + tracker.record(bench_now() - 
start); + } + collector.record_latency_stats("dot_context_makedot", tracker.stats()); + INFO("dot_context::makedot mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── dotin ───────────────────────────────────────────────────────────────── + { + dot_context ctx; + for (int i = 0; i < 100; ++i) ctx.makedot(100 + (i % 10)); + + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + std::pair dot{100 + (i % 10), i % 50}; + uint64_t start = bench_now(); + [[maybe_unused]] bool r = ctx.dotin(dot); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("dot_context_dotin", tracker.stats()); + INFO("dot_context::dotin mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── join ────────────────────────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + + for (int i = 0; i < 1000; ++i) { + dot_context ctx1; + dot_context ctx2; + for (int j = 0; j < 10; ++j) { + ctx1.makedot(100); + ctx2.makedot(200); + } + uint64_t start = bench_now(); + ctx1.join(ctx2); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("dot_context_join", tracker.stats()); + INFO("dot_context::join mean: " << tracker.stats().mean_ns << " ns"); + } + + // ── compact ─────────────────────────────────────────────────────────────── + { + LatencyTracker tracker(1000); + + for (int i = 0; i < 1000; ++i) { + dot_context ctx; + for (int j = 0; j < 50; ++j) ctx.insertdot({100, j * 2}, false); + + uint64_t start = bench_now(); + ctx.compact(); + tracker.record(bench_now() - start); + } + collector.record_latency_stats("dot_context_compact", tracker.stats()); + INFO("dot_context::compact mean: " << tracker.stats().mean_ns << " ns"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "crdt_dot_context"); +} + +// Catch2 BENCHMARK macros — kept hidden; run with [!benchmark] to activate. 
+TEST_CASE("CRDT micro-benchmarks (Catch2 BENCHMARK)", "[.][crdt][!benchmark]") { + + BENCHMARK("mvreg write") { + mvreg reg; + reg.id = 100; + auto attr = make_test_attribute(100, 42); + return reg.write(attr); + }; + + BENCHMARK("mvreg join") { + mvreg reg; + reg.id = 100; + auto attr1 = make_test_attribute(100, 1); + auto delta = reg.write(attr1); + + mvreg delta_reg; + delta_reg.id = 200; + delta_reg.join(std::move(delta)); + auto attr2 = make_test_attribute(200, 2); + delta = delta_reg.write(attr2); + + reg.join(std::move(delta)); + return reg.read_reg(); + }; + + BENCHMARK("dot_context makedot") { + dot_context ctx; + return ctx.makedot(100); + }; + + BENCHMARK("dot_context join") { + dot_context ctx1; + dot_context ctx2; + for (int i = 0; i < 10; ++i) { + ctx1.makedot(100); + ctx2.makedot(200); + } + ctx1.join(ctx2); + return ctx1.cc.size(); + }; +} diff --git a/benchmarks/latency/delta_propagation_bench.cpp b/benchmarks/latency/delta_propagation_bench.cpp new file mode 100644 index 0000000..77380f7 --- /dev/null +++ b/benchmarks/latency/delta_propagation_bench.cpp @@ -0,0 +1,334 @@ +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +// Multi-agent tests require working DDS synchronization +// Skip these by default - run with "[delta]" tag explicitly to test +TEST_CASE("Delta propagation latency between agents", "[LATENCY][delta][.multi]") { + // Setup + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("delta_propagation"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + + // Wait for DDS discovery and initial sync + fixture.wait_for_sync(std::chrono::milliseconds(500)); + 
REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + REQUIRE(agent_a != nullptr); + REQUIRE(agent_b != nullptr); + + SECTION("Node insertion propagation latency") { + LatencyTracker tracker(100); + std::atomic receive_time{0}; + std::atomic received{false}; + uint64_t expected_node_id = 0; + + // Connect to agent B's signal + QObject::connect(agent_b, &DSR::DSRGraph::update_node_signal, agent_b, + [&](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id == expected_node_id) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto node = GraphGenerator::create_test_node( + 2000 + i, agent_a->get_agent_id(), "warmup_" + std::to_string(i)); + agent_a->insert_node(node); + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 0; i < 100; ++i) { + received.store(false); + + auto node = GraphGenerator::create_test_node( + expected_node_id, agent_a->get_agent_id(), + "bench_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + expected_node_id = agent_a->insert_node(node).value_or(0); + + // Wait for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for node propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = tracker.stats(); + collector.record_latency_stats("node_propagation", stats); + + INFO("Node propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + // Validation + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + SECTION("Edge insertion propagation latency") { + LatencyTracker tracker(100); 
+ std::atomic receive_time{0}; + std::atomic received{false}; + + // First create nodes on agent A + auto root = agent_a->get_node_root(); + REQUIRE(root.has_value()); + + std::vector node_to_ids = {}; + + for (int i = 0; i < 110; ++i) { + auto node = GraphGenerator::create_test_node( + 4000 + i, agent_a->get_agent_id(), "edge_node_" + std::to_string(i)); + node_to_ids.emplace_back(agent_a->insert_node(node).value_or(0)); + } + + // Wait for all nodes to sync to agent B before creating edges + fixture.wait_for_sync(std::chrono::milliseconds(500)); + REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + // Connect to agent B's edge signal + uint64_t expected_from = 0; + uint64_t expected_to = 0; + QObject::connect(agent_b, &DSR::DSRGraph::update_edge_signal, agent_b, + [&](uint64_t from, uint64_t to, const std::string& type, DSR::SignalInfo) { + if (from == expected_from && to == expected_to) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto edge = GraphGenerator::create_test_edge( + root->id(), node_to_ids[i], agent_a->get_agent_id()); + agent_a->insert_or_assign_edge(edge); + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 10; i < 110; ++i) { + expected_from = root->id(); + expected_to = node_to_ids[i]; + received.store(false); + + auto edge = GraphGenerator::create_test_edge( + expected_from, expected_to, agent_a->get_agent_id()); + + uint64_t send_time = get_unix_timestamp(); + agent_a->insert_or_assign_edge(edge); + + // Wait for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for edge propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = 
tracker.stats(); + collector.record_latency_stats("edge_propagation", stats); + + INFO("Edge propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + SECTION("Attribute update propagation latency") { + LatencyTracker tracker(100); + std::atomic receive_time{0}; + std::atomic received{false}; + + // Create a node for attribute updates + auto test_node = GraphGenerator::create_test_node( + 5000, agent_a->get_agent_id(), "attr_test_node"); + auto insert_result = agent_a->insert_node(test_node); + REQUIRE(insert_result.has_value()); + + // Wait for sync to agent B + fixture.wait_for_sync(std::chrono::milliseconds(500)); + REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + // Verify node exists on agent A + auto check_node = agent_a->get_node(*insert_result); + REQUIRE(check_node.has_value()); + + // Connect to agent B's attribute signal + QObject::connect(agent_b, &DSR::DSRGraph::update_node_attr_signal, agent_b, + [&](uint64_t id, const std::vector& att_names, DSR::SignalInfo) { + if (id == *insert_result) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto node = agent_a->get_node(*insert_result); + if (node) { + agent_a->add_or_modify_attrib_local(*node, static_cast(i)); + agent_a->update_node(*node); + } + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 0; i < 100; ++i) { + received.store(false); + + auto node = agent_a->get_node(*insert_result); + REQUIRE(node.has_value()); + + agent_a->add_or_modify_attrib_local(*node, static_cast(1000 + i)); + + uint64_t send_time = get_unix_timestamp(); + agent_a->update_node(*node); + + // Wait for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if 
(std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for attribute propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = tracker.stats(); + collector.record_latency_stats("attribute_propagation", stats); + + INFO("Attribute propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + // Export results + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "delta_propagation"); +} + +TEST_CASE("Delta propagation with varying agent counts", "[LATENCY][delta][scalability][.multi]") { + MetricsCollector collector("delta_propagation_scaling"); + GraphGenerator generator; + + for (uint32_t num_agents : {2, 4, 8}) { + SECTION("With " + std::to_string(num_agents) + " agents") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents, skipping"); + continue; + } + + // Wait for DDS discovery with all agents + fixture.wait_for_sync(std::chrono::milliseconds(500 * num_agents)); + if (!fixture.verify_convergence(std::chrono::seconds(15))) { + WARN("Agents failed to converge, skipping"); + continue; + } + + auto* sender = fixture.get_agent(0); + REQUIRE(sender != nullptr); + + LatencyTracker tracker(50); + + // Track reception across all other agents + std::atomic received_count{0}; + std::vector> receive_times(num_agents - 1); + std::atomic current_expected_id{0}; + + for (size_t i = 1; i < num_agents; ++i) { + auto* receiver = fixture.get_agent(i); + QObject::connect(receiver, &DSR::DSRGraph::update_node_signal, receiver, + [&, idx = i - 1](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id == current_expected_id.load()) { + 
receive_times[idx].store(get_unix_timestamp()); + received_count.fetch_add(1); + } + }, Qt::DirectConnection); + } + + // Measurement + for (int i = 0; i < 50; ++i) { + received_count.store(0); + for (auto& rt : receive_times) rt.store(0); + + auto node = GraphGenerator::create_test_node( + 0, sender->get_agent_id(), + "scale_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + auto result = sender->insert_node(node); + if (result.has_value()) { + current_expected_id.store(result.value()); + } + + // Wait for all receivers + auto start = std::chrono::steady_clock::now(); + while (received_count.load() < num_agents - 1) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(10)) { + break; + } + } + + // Record max latency (time for all to receive) + uint64_t max_receive = 0; + for (const auto& rt : receive_times) { + max_receive = std::max(max_receive, rt.load()); + } + if (max_receive > 0) { + tracker.record(max_receive - send_time); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats( + "propagation_" + std::to_string(num_agents) + "_agents", + stats, + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "delta_propagation_scaling"); +} diff --git a/benchmarks/latency/signal_latency_bench.cpp b/benchmarks/latency/signal_latency_bench.cpp new file mode 100644 index 0000000..353a1b5 --- /dev/null +++ b/benchmarks/latency/signal_latency_bench.cpp @@ -0,0 +1,293 @@ +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + 
+// Each signal-type gets its own TEST_CASE so all four metrics are exported +// to separate JSON files. Callbacks use bench_now() (steady_clock) to match +// the insert-side timer — no NTP jumps, no system_clock skew. +// 1 000 samples give stable percentiles (p99 = 10 points instead of 1). + +TEST_CASE("Node signal direct latency", "[LATENCY][signal]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_time.store(bench_now()); + callback_called.store(true); + }, Qt::DirectConnection); + + // Warmup — 50 inserts, let caches and branch predictor settle + for (int i = 0; i < 50; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + } + fixture.process_events(); + + // Measurement — 1 000 samples + for (int i = 0; i < 1000; ++i) { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + uint64_t pre_insert = bench_now(); + graph->insert_node(node); + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_insert); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("node_signal_direct", stats); + INFO("Node signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_node_direct"); +} + +TEST_CASE("Edge signal direct latency", "[LATENCY][signal]") { + MultiAgentFixture fixture; + GraphGenerator generator; + 
MetricsCollector collector("signal_latency"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-create enough nodes for warmup (50) + measurement (1000) + std::vector node_ids; + node_ids.reserve(1060); + for (int i = 0; i < 1060; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto id = graph->insert_node(node); + if (id.has_value()) node_ids.push_back(*id); + } + REQUIRE(node_ids.size() >= 1050); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + std::atomic target_to{0}; + + QObject::connect(graph, &DSR::DSRGraph::update_edge_signal, graph, + [&](uint64_t, uint64_t to, const std::string&, DSR::SignalInfo) { + if (to == target_to.load()) { + callback_time.store(bench_now()); + callback_called.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 50; ++i) { + auto edge = GraphGenerator::create_test_edge(root->id(), node_ids[i], graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + fixture.process_events(); + + // Measurement + for (int i = 50; i < 1050; ++i) { + target_to.store(node_ids[i]); + callback_called.store(false); + auto edge = GraphGenerator::create_test_edge(root->id(), node_ids[i], graph->get_agent_id()); + uint64_t pre_insert = bench_now(); + graph->insert_or_assign_edge(edge); + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_insert); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("edge_signal_direct", stats); + INFO("Edge signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_edge_direct"); +} + +TEST_CASE("Attribute 
signal direct latency", "[LATENCY][signal]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "attr_signal_test"); + auto node_id = graph->insert_node(test_node); + REQUIRE(node_id.has_value()); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_attr_signal, graph, + [&](uint64_t id, const std::vector&, DSR::SignalInfo) { + if (id == *node_id) { + callback_time.store(bench_now()); + callback_called.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 50; ++i) { + auto node = graph->get_node(*node_id); + if (node) { + graph->add_or_modify_attrib_local(*node, static_cast(i)); + graph->update_node(*node); + } + } + fixture.process_events(); + + // Measurement + for (int i = 0; i < 1000; ++i) { + callback_called.store(false); + auto node = graph->get_node(*node_id); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local(*node, static_cast(100 + i)); + uint64_t pre_update = bench_now(); + graph->update_node(*node); + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_update); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("attr_signal_direct", stats); + INFO("Attr signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_attr_direct"); +} + +TEST_CASE("Node signal queued latency", "[LATENCY][signal]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + + auto 
config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_time.store(bench_now()); + callback_called.store(true); + }, Qt::QueuedConnection); + + // Warmup + for (int i = 0; i < 50; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + fixture.process_events(); + } + + // Measurement + for (int i = 0; i < 1000; ++i) { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + uint64_t pre_insert = bench_now(); + graph->insert_node(node); + + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(100); + while (!callback_called.load() && std::chrono::steady_clock::now() < deadline) { + fixture.process_events(1); + } + + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_insert); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("node_signal_queued", stats); + INFO("Node signal (queued) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_node_queued"); +} + +TEST_CASE("Signal emission under load", "[LATENCY][signal][stress]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency_stress"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate graph with 1000 nodes + for (int i = 0; i < 1000; ++i) { + auto node = 
GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + } + fixture.process_events(); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_time.store(bench_now()); + callback_called.store(true); + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 50; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + } + fixture.process_events(); + + // Measurement — 1 000 samples with 1 000-node graph + for (int i = 0; i < 1000; ++i) { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + uint64_t pre_insert = bench_now(); + graph->insert_node(node); + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_insert); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("signal_with_1000_nodes", stats, {{"existing_nodes", "1000"}}); + INFO("Signal with 1000 nodes - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_latency_stress"); +} diff --git a/benchmarks/python/bench_binding_overhead.py b/benchmarks/python/bench_binding_overhead.py new file mode 100644 index 0000000..fe54d2d --- /dev/null +++ b/benchmarks/python/bench_binding_overhead.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +""" +Benchmark: Python binding overhead vs C++. + +Measures the overhead introduced by pybind11 bindings by comparing +Python API operations with baseline measurements. 
+""" + +import sys +import os +import time + +# Add parent to path for imports +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file, warmup + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found. Build with Python bindings enabled.") + sys.exit(1) + + +def benchmark_node_creation(): + """Benchmark Node object creation overhead.""" + collector = MetricsCollector("binding_overhead_node_creation") + tracker = LatencyTracker(1000) + + # Warmup + for i in range(100): + _ = pydsr.Node(1, "testtype", f"warmup_{i}") + + # Measure + for i in range(1000): + with tracker.measure(): + node = pydsr.Node(1, "testtype", f"node_{i}") + + stats = tracker.stats() + collector.record_latency_stats("node_creation", stats) + print(f"Node creation: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + + return collector + + +def benchmark_edge_creation(): + """Benchmark Edge object creation overhead.""" + collector = MetricsCollector("binding_overhead_edge_creation") + tracker = LatencyTracker(1000) + + # Warmup + for i in range(100): + _ = pydsr.Edge(100, 200, "testtype_e", 1) + + # Measure + for i in range(1000): + with tracker.measure(): + edge = pydsr.Edge(100, 200 + i, "testtype_e", 1) + + stats = tracker.stats() + collector.record_latency_stats("edge_creation", stats) + print(f"Edge creation: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + + return collector + + +def benchmark_attribute_creation(): + """Benchmark Attribute creation with different types.""" + collector = MetricsCollector("binding_overhead_attribute") + + # String attribute + tracker = LatencyTracker(1000) + warmup(lambda: pydsr.Attribute("test")) + for _ in range(1000): + with tracker.measure(): + _ = pydsr.Attribute("test_string") + stats = tracker.stats() + collector.record_latency_stats("attribute_string", stats) + print(f"Attribute(string): mean={stats.mean_us:.2f} us") + + # Int attribute + 
tracker = LatencyTracker(1000) + warmup(lambda: pydsr.Attribute(42)) + for _ in range(1000): + with tracker.measure(): + _ = pydsr.Attribute(42) + stats = tracker.stats() + collector.record_latency_stats("attribute_int", stats) + print(f"Attribute(int): mean={stats.mean_us:.2f} us") + + # Float attribute + tracker = LatencyTracker(1000) + warmup(lambda: pydsr.Attribute(3.14)) + for _ in range(1000): + with tracker.measure(): + _ = pydsr.Attribute(3.14159) + stats = tracker.stats() + collector.record_latency_stats("attribute_float", stats) + print(f"Attribute(float): mean={stats.mean_us:.2f} us") + + # List attribute + tracker = LatencyTracker(1000) + test_list = [1.0, 2.0, 3.0] + warmup(lambda: pydsr.Attribute(test_list)) + for _ in range(1000): + with tracker.measure(): + _ = pydsr.Attribute(test_list) + stats = tracker.stats() + collector.record_latency_stats("attribute_list", stats) + print(f"Attribute(list[3]): mean={stats.mean_us:.2f} us") + + return collector + + +def benchmark_attribute_numpy(): + """Benchmark Attribute with numpy arrays (large data).""" + try: + import numpy as np + except ImportError: + print("Numpy not available, skipping numpy benchmarks") + return None + + collector = MetricsCollector("binding_overhead_numpy") + + for size in [1000, 10000, 100000, 1000000]: + tracker = LatencyTracker(100) + data = np.random.randint(0, 255, size, dtype=np.uint8) + + # Warmup + for _ in range(10): + attr = pydsr.Attribute([0]) + attr.value = data + + # Measure set + for _ in range(100): + attr = pydsr.Attribute([0]) + with tracker.measure(): + attr.value = data + + stats = tracker.stats() + collector.record_latency_stats(f"numpy_set_{size}", stats, + tags={"size": str(size)}) + print(f"Numpy set ({size} bytes): mean={stats.mean_us:.2f} us") + + # Measure get + tracker = LatencyTracker(100) + attr = pydsr.Attribute([0]) + attr.value = data + for _ in range(100): + with tracker.measure(): + _ = attr.value + + stats = tracker.stats() + 
collector.record_latency_stats(f"numpy_get_{size}", stats, + tags={"size": str(size)}) + print(f"Numpy get ({size} bytes): mean={stats.mean_us:.2f} us") + + return collector + + +def benchmark_graph_creation(): + """Benchmark DSRGraph creation overhead.""" + collector = MetricsCollector("binding_overhead_graph") + tracker = LatencyTracker(10) + + config_file = make_temp_config_file() + + # This is expensive, only do a few iterations + for i in range(10): + with tracker.measure(): + g = pydsr.DSRGraph(0, f"bench_graph_{i}", 100 + i, config_file) + del g + time.sleep(0.5) # Allow cleanup + + stats = tracker.stats() + collector.record_latency_stats("graph_creation", stats) + print(f"Graph creation: mean={stats.mean_ms:.2f} ms") + + os.unlink(config_file) + return collector + + +def main(): + print("=" * 60) + print("DSR Python Binding Overhead Benchmarks") + print("=" * 60) + print() + + collectors = [] + + print("--- Node/Edge/Attribute Creation ---") + collectors.append(benchmark_node_creation()) + collectors.append(benchmark_edge_creation()) + collectors.append(benchmark_attribute_creation()) + + print("\n--- Numpy Array Operations ---") + numpy_collector = benchmark_attribute_numpy() + if numpy_collector: + collectors.append(numpy_collector) + + print("\n--- Graph Creation ---") + collectors.append(benchmark_graph_creation()) + + # Export results + print("\n--- Exporting Results ---") + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + + for c in collectors: + if c: + c.export_json(os.path.join(results_dir, f"python_{c.benchmark_name}.json")) + c.export_csv(os.path.join(results_dir, f"python_{c.benchmark_name}.csv")) + + print(f"Results exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_graph_operations.py b/benchmarks/python/bench_graph_operations.py new file mode 100644 index 0000000..1c8df08 --- 
#!/usr/bin/env python3
"""
Benchmark: Graph operations (CRUD) performance.

Measures insert, read, update, delete performance for nodes and edges.
"""

import sys
import os
import time

sys.path.insert(0, os.path.dirname(__file__))

from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file, warmup

try:
    import pydsr
except ImportError:
    print("Error: pydsr module not found.")
    sys.exit(1)


def benchmark_node_operations(graph: pydsr.DSRGraph, collector: MetricsCollector):
    """Benchmark node CRUD operations: insert, read (by id / by name), update, delete.

    Records one latency metric per operation into `collector` and prints a
    one-line summary for each.
    """
    agent_id = graph.get_agent_id()

    # --- Insert ---
    tracker = LatencyTracker(500)

    # Warmup (results discarded; nodes stay in the graph)
    for i in range(50):
        node = pydsr.Node(agent_id, "testtype", f"warmup_{i}")
        graph.insert_node(node)

    # Measure: only insert_node() is inside the timed region.
    for i in range(500):
        node = pydsr.Node(agent_id, "testtype", f"bench_node_{i}")
        with tracker.measure():
            graph.insert_node(node)

    stats = tracker.stats()
    collector.record_latency_stats("node_insert", stats)
    print(f"Node insert: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us")

    # --- Read by ID ---
    tracker = LatencyTracker(1000)
    nodes = graph.get_nodes()
    node_ids = [n.id for n in nodes[:100]]
    if not node_ids:
        # Guard: the round-robin modulo below would divide by zero.
        print("Node read: no nodes available, skipping remaining node benchmarks")
        return

    for i in range(1000):
        node_id = node_ids[i % len(node_ids)]
        with tracker.measure():
            _ = graph.get_node(node_id)

    stats = tracker.stats()
    collector.record_latency_stats("node_read_by_id", stats)
    print(f"Node read (by id): mean={stats.mean_us:.2f} us")

    # --- Read by name ---
    tracker = LatencyTracker(1000)
    node_names = [f"bench_node_{i}" for i in range(100)]

    for i in range(1000):
        name = node_names[i % len(node_names)]
        with tracker.measure():
            _ = graph.get_node(name)

    stats = tracker.stats()
    collector.record_latency_stats("node_read_by_name", stats)
    print(f"Node read (by name): mean={stats.mean_us:.2f} us")

    # --- Update ---
    tracker = LatencyTracker(500)
    test_node = graph.get_node("bench_node_0")
    if test_node is None:
        # Guard: update_node(None) would raise; skip rather than crash.
        print("Node update: target node not found, skipping update/delete")
        return

    for i in range(500):
        test_node.attrs["level"] = pydsr.Attribute(i)
        with tracker.measure():
            graph.update_node(test_node)

    stats = tracker.stats()
    collector.record_latency_stats("node_update", stats)
    print(f"Node update: mean={stats.mean_us:.2f} us")

    # --- Delete ---
    tracker = LatencyTracker(100)
    delete_nodes = [f"bench_node_{i}" for i in range(400, 500)]

    for name in delete_nodes:
        with tracker.measure():
            graph.delete_node(name)

    stats = tracker.stats()
    collector.record_latency_stats("node_delete", stats)
    print(f"Node delete: mean={stats.mean_us:.2f} us")


def benchmark_edge_operations(graph: pydsr.DSRGraph, collector: MetricsCollector):
    """Benchmark edge CRUD operations (insert, read, delete) against the root node."""
    agent_id = graph.get_agent_id()

    # All benchmark edges hang off the root node.
    root = graph.get_node("root")
    if not root:
        print("No root node found")
        return

    # Create target nodes for edges
    for i in range(200):
        node = pydsr.Node(agent_id, "testtype", f"edge_target_{i}")
        graph.insert_node(node)

    time.sleep(0.1)

    # --- Insert edge ---
    tracker = LatencyTracker(200)

    for i in range(200):
        target = graph.get_node(f"edge_target_{i}")
        if target:
            edge = pydsr.Edge(target.id, root.id, "testtype_e", agent_id)
            with tracker.measure():
                graph.insert_or_assign_edge(edge)

    stats = tracker.stats()
    collector.record_latency_stats("edge_insert", stats)
    print(f"Edge insert: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us")

    # --- Read edge ---
    tracker = LatencyTracker(500)

    for i in range(500):
        target = graph.get_node(f"edge_target_{i % 200}")
        if target:
            with tracker.measure():
                _ = graph.get_edge(root.id, target.id, "testtype_e")

    stats = tracker.stats()
    collector.record_latency_stats("edge_read", stats)
    print(f"Edge read: mean={stats.mean_us:.2f} us")

    # --- Delete edge ---
    tracker = LatencyTracker(100)

    for i in range(100, 200):
        target = graph.get_node(f"edge_target_{i}")
        if target:
            with tracker.measure():
                graph.delete_edge(root.id, target.id, "testtype_e")

    stats = tracker.stats()
    collector.record_latency_stats("edge_delete", stats)
    print(f"Edge delete: mean={stats.mean_us:.2f} us")


def benchmark_query_operations(graph: pydsr.DSRGraph, collector: MetricsCollector):
    """Benchmark whole-graph query operations (get_nodes, by-type, edge queries)."""

    # --- get_nodes ---
    tracker = LatencyTracker(100)

    for _ in range(100):
        with tracker.measure():
            _ = graph.get_nodes()

    stats = tracker.stats()
    collector.record_latency_stats("get_all_nodes", stats)
    print(f"get_nodes(): mean={stats.mean_us:.2f} us")

    # --- get_nodes_by_type ---
    tracker = LatencyTracker(100)

    for _ in range(100):
        with tracker.measure():
            _ = graph.get_nodes_by_type("testtype")

    stats = tracker.stats()
    collector.record_latency_stats("get_nodes_by_type", stats)
    print(f"get_nodes_by_type(): mean={stats.mean_us:.2f} us")

    # --- get_edges (from node) ---
    root = graph.get_node("root")
    if root:
        tracker = LatencyTracker(100)

        for _ in range(100):
            with tracker.measure():
                _ = graph.get_edges(root.id)

        stats = tracker.stats()
        collector.record_latency_stats("get_edges_from_node", stats)
        print(f"get_edges(id): mean={stats.mean_us:.2f} us")

    # --- get_edges_to_id ---
    if root:
        tracker = LatencyTracker(100)

        for _ in range(100):
            with tracker.measure():
                _ = graph.get_edges_to_id(root.id)

        stats = tracker.stats()
        collector.record_latency_stats("get_edges_to_id", stats)
        print(f"get_edges_to_id(id): mean={stats.mean_us:.2f} us")

    # --- get_edges_by_type ---
    tracker = LatencyTracker(100)

    for _ in range(100):
        with tracker.measure():
            _ = graph.get_edges_by_type("testtype_e")

    stats = tracker.stats()
    collector.record_latency_stats("get_edges_by_type", stats)
    print(f"get_edges_by_type(): mean={stats.mean_us:.2f} us")


def main():
    print("=" * 60)
    print("DSR Python Graph Operations Benchmarks")
    print("=" * 60)
    print()

    collector = MetricsCollector("graph_operations")

    # Create graph
    config_file = make_temp_config_file()
    graph = pydsr.DSRGraph(0, "bench_graph_ops", 42, config_file)
    time.sleep(0.5)

    print("--- Node Operations ---")
    benchmark_node_operations(graph, collector)

    print("\n--- Edge Operations ---")
    benchmark_edge_operations(graph, collector)

    print("\n--- Query Operations ---")
    benchmark_query_operations(graph, collector)

    # Cleanup
    del graph
    os.unlink(config_file)

    # Export
    results_dir = os.environ.get(
        "BENCH_RESULTS_DIR",
        os.path.join(os.path.dirname(__file__), "..", "results"),
    )
    os.makedirs(results_dir, exist_ok=True)
    collector.export_json(os.path.join(results_dir, "python_graph_operations.json"))
    collector.export_csv(os.path.join(results_dir, "python_graph_operations.csv"))
    print(f"\nResults exported to {results_dir}")


if __name__ == "__main__":
    main()
+""" + +import sys +import os +import time +import threading + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + + +def benchmark_signal_callback_latency(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure signal callback invocation latency.""" + agent_id = graph.get_agent_id() + tracker = LatencyTracker(100) + + callback_time = [0] + callback_received = threading.Event() + expected_id = [0] + + def on_node_update(node_id: int, node_type: str): + if node_id == expected_id[0]: + callback_time[0] = time.perf_counter_ns() + callback_received.set() + + # Connect signal + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, on_node_update) + + # Warmup + for i in range(20): + node = pydsr.Node(agent_id, "testtype", f"warmup_sig_{i}") + graph.insert_node(node) + time.sleep(0.05) + + # Measure + for i in range(100): + callback_received.clear() + node = pydsr.Node(agent_id, "testtype", f"signal_node_{i}") + + send_time = time.perf_counter_ns() + expected_id[0] = graph.insert_node(node) + + # Wait for callback + if callback_received.wait(timeout=2.0): + latency = callback_time[0] - send_time + tracker.record(latency) + + stats = tracker.stats() + collector.record_latency_stats("signal_callback_latency", stats) + print(f"Signal callback latency: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + print(f" (received {tracker.count}/100 callbacks)") + + +def benchmark_signal_throughput(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure how many signals can be processed per second. + + Uses a fixed insert count instead of a time-based loop to keep the + callback backlog bounded. An unbounded loop (e.g. 3 s × 40K inserts/sec) + creates a queue that outlasts the benchmark and blocks graph teardown. 
+ """ + agent_id = graph.get_agent_id() + + callback_count = [0] + + def on_node_update(node_id: int, node_type: str): + callback_count[0] += 1 + + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, on_node_update) + + INSERT_COUNT = 3000 + print("Generating signals...") + start = time.perf_counter() + + for i in range(INSERT_COUNT): + node = pydsr.Node(agent_id, "testtype", f"sig_tp_{i}") + graph.insert_node(node) + + # Wait for callbacks to drain, but give up after a timeout so teardown + # isn't blocked indefinitely if the callback rate is very slow. + drain_deadline = time.perf_counter() + 5.0 + prev = -1 + while time.perf_counter() < drain_deadline: + time.sleep(0.1) + cur = callback_count[0] + if cur == prev: # no new callbacks — queue is drained + break + prev = cur + + duration = time.perf_counter() - start + callbacks_per_sec = callback_count[0] / duration + + collector.record_throughput("signal_callbacks", callback_count[0], duration) + print(f"Signal throughput: {callbacks_per_sec:.0f} callbacks/sec") + print(f" ({callback_count[0]} callbacks for {INSERT_COUNT} inserts)") + + +def benchmark_multiple_handlers(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure impact of multiple signal handlers.""" + agent_id = graph.get_agent_id() + + for num_handlers in [1, 5, 10]: + callback_counts = [0] * num_handlers + + def make_handler(idx): + def handler(node_id: int, node_type: str): + callback_counts[idx] += 1 + return handler + + # Connect multiple handlers + handlers = [make_handler(i) for i in range(num_handlers)] + for h in handlers: + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, h) + + # Generate updates + insert_count = 100 + start = time.perf_counter() + + for i in range(insert_count): + node = pydsr.Node(agent_id, "testtype", f"mh_{num_handlers}_{i}") + graph.insert_node(node) + + time.sleep(0.3) # Let callbacks process + duration = time.perf_counter() - start + + total_callbacks = sum(callback_counts) + 
collector.record("callbacks_with_handlers", "throughput", + total_callbacks / duration, + "callbacks/sec", + tags={"num_handlers": str(num_handlers)}) + + print(f"{num_handlers} handlers: {total_callbacks} callbacks in {duration:.2f}s") + + +def main(): + print("=" * 60) + print("DSR Python Signal Benchmarks") + print("=" * 60) + print() + + collector = MetricsCollector("signals") + + config_file = make_temp_config_file() + graph = pydsr.DSRGraph(0, "bench_signals", 42, config_file) + time.sleep(0.5) + + print("--- Signal Callback Latency ---") + benchmark_signal_callback_latency(graph, collector) + + print("\n--- Signal Throughput ---") + benchmark_signal_throughput(graph, collector) + + print("\n--- Multiple Handlers Impact ---") + benchmark_multiple_handlers(graph, collector) + + del graph + os.unlink(config_file) + + # Export + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_signals.json")) + collector.export_csv(os.path.join(results_dir, "python_signals.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_throughput.py b/benchmarks/python/bench_throughput.py new file mode 100644 index 0000000..89afafe --- /dev/null +++ b/benchmarks/python/bench_throughput.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Benchmark: Single-agent throughput + latency for node/edge operations. + +Runs a 5-second measurement window per operation while tracking per-op +latency via LatencyTracker.measure(). Exports to python_throughput.json. 
+""" + +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + +_DURATION = 5.0 # seconds per benchmark + + +def benchmark_node_insert(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + tracker = LatencyTracker() + ops = 0 + t_end = time.perf_counter() + _DURATION + while time.perf_counter() < t_end: + node = pydsr.Node(agent_id, "testtype", f"thr_ins_{ops}") + with tracker.measure(): + graph.insert_node(node) + ops += 1 + collector.record_throughput("node_insert", ops, _DURATION) + collector.record_latency_stats("node_insert", tracker.stats()) + stats = tracker.stats() + print(f"Node insert: {ops / _DURATION:.0f} ops/sec, mean {stats.mean_us:.2f} µs") + + +def benchmark_node_read(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + + # Pre-populate 1000 nodes for round-robin reads + node_ids = [] + for i in range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_rd_{i}") + result = graph.insert_node(node) + if result is not None: + node_ids.append(result) + if not node_ids: + print("Node read: no nodes to read, skipping") + return + + tracker = LatencyTracker() + ops = 0 + pool = len(node_ids) + t_end = time.perf_counter() + _DURATION + while time.perf_counter() < t_end: + nid = node_ids[ops % pool] + with tracker.measure(): + graph.get_node(nid) + ops += 1 + collector.record_throughput("node_read", ops, _DURATION) + collector.record_latency_stats("node_read", tracker.stats()) + stats = tracker.stats() + print(f"Node read: {ops / _DURATION:.0f} ops/sec, mean {stats.mean_us:.2f} µs") + + +def benchmark_node_update(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + + node = pydsr.Node(agent_id, "testtype", "thr_upd_target") + 
graph.insert_node(node) + target = graph.get_node("thr_upd_target") + if not target: + print("Node update: could not retrieve target node, skipping") + return + + tracker = LatencyTracker() + ops = 0 + t_end = time.perf_counter() + _DURATION + while time.perf_counter() < t_end: + target.attrs["level"] = pydsr.Attribute(ops % 1000) + with tracker.measure(): + graph.update_node(target) + ops += 1 + collector.record_throughput("node_update", ops, _DURATION) + collector.record_latency_stats("node_update", tracker.stats()) + stats = tracker.stats() + print(f"Node update: {ops / _DURATION:.0f} ops/sec, mean {stats.mean_us:.2f} µs") + + +def benchmark_edge_insert(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + + root = graph.get_node("root") + if not root: + print("Edge insert: no root node, skipping") + return + + # Pre-populate 1000 target nodes + targets = [] + for i in range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_etgt_{i}") + graph.insert_node(node) + n = graph.get_node(f"thr_etgt_{i}") + if n: + targets.append(n.id) + if not targets: + print("Edge insert: no target nodes, skipping") + return + + tracker = LatencyTracker() + ops = 0 + pool = len(targets) + t_end = time.perf_counter() + _DURATION + while time.perf_counter() < t_end: + tid = targets[ops % pool] + edge = pydsr.Edge(tid, root.id, "testtype_e", agent_id) + with tracker.measure(): + graph.insert_or_assign_edge(edge) + ops += 1 + collector.record_throughput("edge_insert", ops, _DURATION) + collector.record_latency_stats("edge_insert", tracker.stats()) + stats = tracker.stats() + print(f"Edge insert: {ops / _DURATION:.0f} ops/sec, mean {stats.mean_us:.2f} µs") + + +def benchmark_edge_read(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + + root = graph.get_node("root") + if not root: + print("Edge read: no root node, skipping") + return + + # Pre-populate 1000 target nodes + edges + targets = [] + for i in 
range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_erd_{i}") + graph.insert_node(node) + n = graph.get_node(f"thr_erd_{i}") + if n: + targets.append(n.id) + edge = pydsr.Edge(n.id, root.id, "testtype_e", agent_id) + graph.insert_or_assign_edge(edge) + if not targets: + print("Edge read: no target edges, skipping") + return + + tracker = LatencyTracker() + ops = 0 + pool = len(targets) + t_end = time.perf_counter() + _DURATION + while time.perf_counter() < t_end: + tid = targets[ops % pool] + with tracker.measure(): + graph.get_edge(root.id, tid, "testtype_e") + ops += 1 + collector.record_throughput("edge_read", ops, _DURATION) + collector.record_latency_stats("edge_read", tracker.stats()) + stats = tracker.stats() + print(f"Edge read: {ops / _DURATION:.0f} ops/sec, mean {stats.mean_us:.2f} µs") + + +def main(): + print("=" * 60) + print("DSR Python Throughput + Latency Benchmarks") + print("=" * 60) + print() + + collector = MetricsCollector("python_throughput") + config_file = make_temp_config_file() + + graph = pydsr.DSRGraph(0, "bench_throughput", 43, config_file) + time.sleep(0.5) + + print("--- Node operations ---") + benchmark_node_insert(graph, collector) + benchmark_node_read(graph, collector) + benchmark_node_update(graph, collector) + + print("\n--- Edge operations ---") + benchmark_edge_insert(graph, collector) + benchmark_edge_read(graph, collector) + + del graph + os.unlink(config_file) + + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_throughput.json")) + collector.export_csv(os.path.join(results_dir, "python_throughput.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_utils.py b/benchmarks/python/bench_utils.py new file mode 100644 index 0000000..710d743 --- /dev/null +++ 
"""
Utility functions for DSR Python benchmarks.
"""

import time
import statistics
import json
import csv
import os
from dataclasses import dataclass, field
from typing import Callable, List, Dict, Any, Optional
from contextlib import contextmanager


@dataclass
class LatencyStats:
    """Statistics from latency measurements.

    All stored values are in nanoseconds; the `*_us` / `*_ms` properties
    provide converted views for reporting.
    """
    count: int = 0
    mean_ns: float = 0.0
    stddev_ns: float = 0.0
    min_ns: float = 0.0
    max_ns: float = 0.0
    p50_ns: float = 0.0
    p90_ns: float = 0.0
    p95_ns: float = 0.0
    p99_ns: float = 0.0

    @property
    def mean_us(self) -> float:
        return self.mean_ns / 1000.0

    @property
    def mean_ms(self) -> float:
        return self.mean_ns / 1_000_000.0

    @property
    def p99_us(self) -> float:
        return self.p99_ns / 1000.0

    @property
    def p99_ms(self) -> float:
        return self.p99_ns / 1_000_000.0


class LatencyTracker:
    """Collects latency samples (nanoseconds) and computes statistics."""

    def __init__(self, expected_samples: int = 100):
        # `expected_samples` is accepted for API compatibility with callers
        # that pass an iteration count; Python lists need no preallocation,
        # so it is intentionally unused.
        self.samples: List[float] = []

    def record(self, latency_ns: float):
        """Record a latency sample in nanoseconds."""
        self.samples.append(latency_ns)

    def record_seconds(self, latency_sec: float):
        """Record a latency sample in seconds (converted to ns)."""
        self.samples.append(latency_sec * 1_000_000_000)

    @contextmanager
    def measure(self):
        """Context manager that records the elapsed wall time of its body."""
        start = time.perf_counter_ns()
        yield
        self.samples.append(time.perf_counter_ns() - start)

    def clear(self):
        """Discard all recorded samples."""
        self.samples.clear()

    @property
    def count(self) -> int:
        return len(self.samples)

    def stats(self) -> LatencyStats:
        """Compute and return statistics; all-zero stats when empty."""
        if not self.samples:
            return LatencyStats()

        sorted_samples = sorted(self.samples)
        n = len(sorted_samples)

        def percentile(p: float) -> float:
            # Linear interpolation between closest ranks.
            idx = p * (n - 1)
            lower = int(idx)
            upper = min(lower + 1, n - 1)
            frac = idx - lower
            return sorted_samples[lower] * (1 - frac) + sorted_samples[upper] * frac

        return LatencyStats(
            count=n,
            mean_ns=statistics.mean(sorted_samples),
            stddev_ns=statistics.stdev(sorted_samples) if n > 1 else 0.0,
            min_ns=sorted_samples[0],
            max_ns=sorted_samples[-1],
            p50_ns=percentile(0.50),
            p90_ns=percentile(0.90),
            p95_ns=percentile(0.95),
            p99_ns=percentile(0.99),
        )


@dataclass
class Metric:
    """Individual metric measurement."""
    name: str
    category: str          # "latency" | "throughput" | "scalability" | free-form
    value: float
    unit: str = ""
    additional: Dict[str, float] = field(default_factory=dict)
    tags: Dict[str, str] = field(default_factory=dict)


class MetricsCollector:
    """Collects benchmark metrics and exports them to JSON/CSV."""

    def __init__(self, benchmark_name: str = ""):
        self.benchmark_name = benchmark_name
        self.metrics: List[Metric] = []
        self.metadata: Dict[str, str] = {}
        self.start_time = time.time()

    def record(self, name: str, category: str, value: float,
               unit: str = "", tags: Optional[Dict[str, str]] = None):
        """Record a free-form metric."""
        self.metrics.append(Metric(
            name=name,
            category=category,
            value=value,
            unit=unit,
            tags=tags or {},
        ))

    def record_latency_stats(self, name: str, stats: LatencyStats,
                             tags: Optional[Dict[str, str]] = None):
        """Record a latency metric; mean is the headline value, full
        distribution goes into `additional`."""
        m = Metric(
            name=name,
            category="latency",
            value=stats.mean_ns,
            unit="ns",
            tags=tags or {},
            additional={
                "count": stats.count,
                "mean_ns": stats.mean_ns,
                "stddev_ns": stats.stddev_ns,
                "min_ns": stats.min_ns,
                "max_ns": stats.max_ns,
                "p50_ns": stats.p50_ns,
                "p90_ns": stats.p90_ns,
                "p95_ns": stats.p95_ns,
                "p99_ns": stats.p99_ns,
            }
        )
        self.metrics.append(m)

    def record_scalability(self, name: str, scale_factor: int, value: float,
                           unit: str = "", tags: Optional[Dict[str, str]] = None):
        """Record a metric measured at a given scale factor."""
        m = Metric(name=name, category="scalability", value=value, unit=unit,
                   tags=tags or {}, additional={"scale_factor": float(scale_factor)})
        self.metrics.append(m)

    def record_throughput(self, name: str, operations: int,
                          duration_sec: float, tags: Optional[Dict[str, str]] = None):
        """Record a throughput metric; value is ops/sec (0 when duration <= 0)."""
        ops_per_sec = operations / duration_sec if duration_sec > 0 else 0
        m = Metric(
            name=name,
            category="throughput",
            value=ops_per_sec,
            unit="ops/sec",
            tags=tags or {},
            additional={
                "total_operations": operations,
                "duration_sec": duration_sec,
            }
        )
        self.metrics.append(m)

    def export_json(self, filepath: str):
        """Export metrics to JSON (creates parent directories as needed)."""
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        result = {
            "benchmark_name": self.benchmark_name,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "total_duration_sec": time.time() - self.start_time,
            "metadata": self.metadata,
            "metrics": [
                {
                    "name": m.name,
                    "category": m.category,
                    "value": m.value,
                    "unit": m.unit,
                    "additional": m.additional,
                    "tags": m.tags,
                }
                for m in self.metrics
            ]
        }
        with open(filepath, "w") as f:
            json.dump(result, f, indent=2)

    def export_csv(self, filepath: str):
        """Export metrics to CSV; latency columns are blank for other categories."""
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        with open(filepath, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([
                "benchmark_name", "metric_name", "category", "value", "unit",
                "mean_ns", "p50_ns", "p95_ns", "p99_ns", "count"
            ])
            for m in self.metrics:
                writer.writerow([
                    self.benchmark_name, m.name, m.category, m.value, m.unit,
                    m.additional.get("mean_ns", ""),
                    m.additional.get("p50_ns", ""),
                    m.additional.get("p95_ns", ""),
                    m.additional.get("p99_ns", ""),
                    m.additional.get("count", ""),
                ])


def make_temp_config_file() -> str:
    """Create a minimal DSR config file; returns its path (caller unlinks)."""
    import tempfile
    config = {
        "DSRModel": {
            "symbols": {
                "100": {
                    "attribute": {
                        "level": {"type": 1, "value": 0}
                    },
                    "id": "100",
                    "links": [],
                    "name": "root",
                    "type": "root"
                }
            }
        }
    }
    fd, path = tempfile.mkstemp(suffix=".json", prefix="dsr_bench_")
    with os.fdopen(fd, "w") as f:
        json.dump(config, f)
    return path


def warmup(func: Callable, iterations: int = 10):
    """Run warmup iterations."""
    for _ in range(iterations):
        func()
"""Run warmup iterations.""" + for _ in range(iterations): + func() diff --git a/benchmarks/python/run_all.py b/benchmarks/python/run_all.py new file mode 100644 index 0000000..dede986 --- /dev/null +++ b/benchmarks/python/run_all.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Run all DSR Python benchmarks and record the results as a named run. + +Usage: + python run_all.py # auto-timestamped run + python run_all.py --label "after-fix" # labelled run + python run_all.py --list # list previous runs + python run_all.py --delete # remove a run from the index +""" + +import sys +import os +import subprocess +import time +import json +import argparse +import platform +from datetime import datetime + +BENCHMARKS = [ + "bench_binding_overhead.py", + "bench_graph_operations.py", + "bench_throughput.py", + "bench_signals.py", +] + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RESULTS_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "results")) +RUNS_INDEX = os.path.join(DEFAULT_RESULTS_ROOT, "runs.json") + + +# ── Index helpers ───────────────────────────────────────────────────────────── + +def load_runs() -> list: + if not os.path.isfile(RUNS_INDEX): + return [] + with open(RUNS_INDEX) as f: + return json.load(f) + + +def save_runs(runs: list): + os.makedirs(DEFAULT_RESULTS_ROOT, exist_ok=True) + with open(RUNS_INDEX, "w") as f: + json.dump(runs, f, indent=2) + + +def register_run(run_info: dict): + runs = load_runs() + runs = [r for r in runs if r["id"] != run_info["id"]] + runs.append(run_info) + runs.sort(key=lambda r: r["id"]) + save_runs(runs) + + +# ── Commands ────────────────────────────────────────────────────────────────── + +def cmd_list(): + runs = load_runs() + if not runs: + print("No runs recorded yet.") + return + print(f"{'ID':<22} {'Label':<20} {'Pass/Total':>10} {'Duration':>9}") + print("-" * 70) + for r in runs: + ratio = f"{r.get('benchmarks_passed', 0)}/{r.get('benchmarks_run', 0)}" + dur = f"{r.get('total_duration_sec', 
0):.1f}s" + label = r.get("label") or "-" + print(f"{r['id']:<22} {label:<20} {ratio:>10} {dur:>9}") + + +def cmd_delete(run_id: str): + runs = load_runs() + before = len(runs) + runs = [r for r in runs if r["id"] != run_id] + if len(runs) == before: + print(f"Run '{run_id}' not found in index.") + return + save_runs(runs) + print(f"Removed run '{run_id}' from index (result files kept on disk).") + + +def cmd_run_direct() -> int: + """Run benchmarks using BENCH_RESULTS_DIR already set in the environment. + + Called by the top-level run_benchmarks.py wrapper so it can manage the + run directory and index registration itself. + """ + results_dir = os.environ.get("BENCH_RESULTS_DIR", ".") + print("=" * 70) + print(" DSR Python Benchmark Suite") + print(f" Output : {results_dir}") + print("=" * 70) + print() + + env = dict(os.environ) + results = [] + suite_start = time.time() + + for bench in BENCHMARKS: + bench_path = os.path.join(SCRIPT_DIR, bench) + print(f"\n{'=' * 70}") + print(f"Running: {bench}") + print("=" * 70) + try: + proc = subprocess.run([sys.executable, bench_path], cwd=SCRIPT_DIR, env=env, timeout=300) + results.append((bench, proc.returncode == 0)) + except subprocess.TimeoutExpired: + print(f"TIMEOUT: {bench}") + results.append((bench, False)) + except Exception as e: + print(f"ERROR: {bench}: {e}") + results.append((bench, False)) + + total_duration = time.time() - suite_start + passed = sum(1 for _, ok in results if ok) + print(f"\n {passed}/{len(results)} benchmarks completed in {total_duration:.1f}s") + return 0 if all(ok for _, ok in results) else 1 + + +def cmd_run(label, results_root): + ts = datetime.now() + run_id = ts.strftime("%Y%m%dT%H%M%S") + dir_name = run_id if not label else f"{run_id}_{label.replace(' ', '-')}" + run_dir = os.path.join(results_root, dir_name) + os.makedirs(run_dir, exist_ok=True) + + print("=" * 70) + print(f" DSR Python Benchmark Suite") + print(f" Run ID : {run_id}") + if label: + print(f" Label : {label}") + 
print(f" Output : {run_dir}") + print("=" * 70) + print() + + env = {**os.environ, "BENCH_RESULTS_DIR": run_dir} + + results = [] + suite_start = time.time() + + for bench in BENCHMARKS: + bench_path = os.path.join(SCRIPT_DIR, bench) + print(f"\n{'=' * 70}") + print(f"Running: {bench}") + print("=" * 70) + + try: + proc = subprocess.run( + [sys.executable, bench_path], + cwd=SCRIPT_DIR, + env=env, + timeout=300, + ) + results.append((bench, proc.returncode == 0)) + except subprocess.TimeoutExpired: + print(f"TIMEOUT: {bench}") + results.append((bench, False)) + except Exception as e: + print(f"ERROR: {bench}: {e}") + results.append((bench, False)) + + total_duration = time.time() - suite_start + + try: + git_hash = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + cwd=SCRIPT_DIR, stderr=subprocess.DEVNULL, + ).decode().strip() + except Exception: + git_hash = "" + + run_info = { + "id": run_id, + "label": label or "", + "dir": dir_name, + "timestamp": ts.isoformat(), + "total_duration_sec": round(total_duration, 2), + "benchmarks_run": len(results), + "benchmarks_passed": sum(1 for _, ok in results if ok), + "git_hash": git_hash, + "platform": platform.platform(), + "python": sys.version.split()[0], + } + + with open(os.path.join(run_dir, "run_info.json"), "w") as f: + json.dump(run_info, f, indent=2) + + register_run(run_info) + + print("\n" + "=" * 70) + print(" Summary") + print("=" * 70) + for bench, ok in results: + print(f" [{'PASS' if ok else 'FAIL'}] {bench}") + + passed = sum(1 for _, ok in results if ok) + print(f"\n {passed}/{len(results)} benchmarks completed in {total_duration:.1f}s") + print(f" Run ID : {run_id}") + print(f" Results : {run_dir}") + print(f" Index : {RUNS_INDEX}") + + return 0 if all(ok for _, ok in results) else 1 + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Run DSR benchmarks and track results") + 
parser.add_argument("--label", "-l", help="Human-readable label for this run") + parser.add_argument("--results-root", default=DEFAULT_RESULTS_ROOT, + help="Root directory for all run results") + parser.add_argument("--list", action="store_true", help="List all recorded runs") + parser.add_argument("--delete", metavar="RUN_ID", help="Remove a run from the index") + parser.add_argument("--direct", action="store_true", + help="Run benchmarks using BENCH_RESULTS_DIR from env, skip index registration") + args = parser.parse_args() + + if args.list: + cmd_list() + return 0 + + if args.delete: + cmd_delete(args.delete) + return 0 + + if args.direct: + return cmd_run_direct() + + return cmd_run(args.label, args.results_root) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/report.py b/benchmarks/report.py new file mode 100644 index 0000000..ca63f52 --- /dev/null +++ b/benchmarks/report.py @@ -0,0 +1,1287 @@ +#!/usr/bin/env python3 +""" +Generate a visual HTML report from benchmark results. + +Single run: + python report.py # latest run + python report.py --run 20260314T153000 + +Compare two runs: + python report.py --run 20260314T153000 --baseline 20260313T090000 + +List available runs: + python report.py --list +""" + +import json +import os +import sys +import glob +import argparse +from typing import Optional +from datetime import datetime + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RESULTS_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "results")) +RUNS_INDEX = os.path.join(DEFAULT_RESULTS_ROOT, "runs.json") + + +# ── Data loading ────────────────────────────────────────────────────────────── + +def load_runs_index() -> list: + if not os.path.isfile(RUNS_INDEX): + return [] + with open(RUNS_INDEX) as f: + return json.load(f) + + +def load_run_metrics(run_dir: str) -> list: + """Load all JSON metric files from a run directory. 
+ + Scans two locations: + - /*.json Python benchmark output + - /cpp/results/*.json C++ benchmark output (written by dsr_benchmarks) + """ + SKIP = {"run_info.json"} + search_paths = [ + (run_dir, "*.json"), + (os.path.join(run_dir, "cpp", "results"), "*.json"), + ] + + cpp_dir = os.path.join(run_dir, "cpp", "results") + metrics = [] + for directory, pattern in search_paths: + lang = "cpp" if os.path.abspath(directory) == os.path.abspath(cpp_dir) else "python" + for path in sorted(glob.glob(os.path.join(directory, pattern))): + if os.path.basename(path) in SKIP: + continue + try: + with open(path) as f: + data = json.load(f) + data["_source_file"] = os.path.basename(path) + data["_lang"] = lang + metrics.append(data) + except Exception as e: + print(f"Warning: could not load {path}: {e}", file=sys.stderr) + return metrics + + +def load_run_info(run_dir: str) -> dict: + path = os.path.join(run_dir, "run_info.json") + if os.path.isfile(path): + with open(path) as f: + return json.load(f) + return {} + + +def resolve_run_dir(run_id: str, results_root: str) -> str: + """Find the directory for a run_id (handles labelled dirs like 20260314T153000_label).""" + # Direct match + direct = os.path.join(results_root, run_id) + if os.path.isdir(direct): + return direct + # Prefix match (labelled) + for entry in os.listdir(results_root): + if entry.startswith(run_id): + candidate = os.path.join(results_root, entry) + if os.path.isdir(candidate): + return candidate + # Look up in index + for r in load_runs_index(): + if r["id"] == run_id: + candidate = os.path.join(results_root, r["dir"]) + if os.path.isdir(candidate): + return candidate + raise FileNotFoundError(f"Run directory not found for id '{run_id}'") + + +_UNIT_TO_NS = {"ns": 1, "us": 1_000, "µs": 1_000, "ms": 1_000_000, "s": 1_000_000_000} + + +def _to_ns(value: float, unit: str) -> float: + return value * _UNIT_TO_NS.get(unit.strip(), 1) + + +def flatten_metrics(bench_files: list) -> tuple[list, list]: + """Return 
(latency_metrics, throughput_metrics) as flat lists.""" + latency, throughput = [], [] + for bench in bench_files: + bench_name = bench.get("benchmark_name", bench["_source_file"]) + lang = bench.get("_lang", "python") + for m in bench.get("metrics", []): + add = m.get("additional", {}) + tags = m.get("tags", {}) + unit = m.get("unit", "") + category = m.get("category", "") + + # For scalability metrics with repeated names, append the tag that + # differentiates them (e.g. graph_size) so each row is unique. + metric_name = m["name"] + if tags: + tag_suffix = "_".join(f"{k}={v}" for k, v in tags.items() + if k in ("graph_size", "num_threads", "scale_factor")) + if tag_suffix: + metric_name = f"{metric_name}@{tag_suffix}" + + entry = { + "benchmark": bench_name, + "metric": metric_name, + "lang": lang, + "value": m["value"], + "unit": unit, + "additional": add, + } + + if category == "latency": + entry.update({ + "mean_ns": add.get("mean_ns", m["value"]), + "p50_ns": add.get("p50_ns", 0), + "p95_ns": add.get("p95_ns", 0), + "p99_ns": add.get("p99_ns", 0), + "min_ns": add.get("min_ns", 0), + "max_ns": add.get("max_ns", 0), + "count": int(add.get("count", 0)), + }) + latency.append(entry) + elif category == "throughput": + entry.update({ + "ops_per_sec": m["value"], + "total_ops": add.get("total_operations", 0), + "duration_sec": add.get("duration_sec", add.get("duration_ms", 0) / 1000), + }) + throughput.append(entry) + elif category == "scalability" and unit in _UNIT_TO_NS: + # Scalability metrics are time-based (value already in the + # declared unit). Promote them to the latency list. 
+ mean_ns = _to_ns(m["value"], unit) + entry.update({ + "mean_ns": mean_ns, + "p50_ns": 0, + "p95_ns": 0, + "p99_ns": 0, + "min_ns": 0, + "max_ns": 0, + "count": int(add.get("count", add.get("scale_factor", 0))), + }) + latency.append(entry) + return latency, throughput + + +# ── Scalability flattening ──────────────────────────────────────────────────── + +SCALE_DIMS = ("threads", "graph_size", "agents") + + +def flatten_scalability(bench_files: list) -> list: + """Return a flat list of scalability data points. + + Any metric tagged with a recognised scale dimension (threads, graph_size, + or agents) is included — regardless of category — so latency, throughput, + and scalability records all contribute. + """ + rows = [] + for bench in bench_files: + lang = bench.get("_lang", "python") + bench_name = bench.get("benchmark_name", bench["_source_file"]) + for m in bench.get("metrics", []): + tags = m.get("tags", {}) + add = m.get("additional", {}) + scale_dim = next((d for d in SCALE_DIMS if d in tags), None) + if scale_dim is None: + continue + try: + scale_val = int(tags[scale_dim]) + except (ValueError, KeyError): + continue + cat = m.get("category", "") + rows.append({ + "benchmark": bench_name, + "operation": m["name"], + "lang": lang, + "category": cat, + "scale_dim": scale_dim, + "scale_val": scale_val, + "value": m["value"], + "unit": m.get("unit", ""), + "mean_ns": add.get("mean_ns", 0.0), + "p99_ns": add.get("p99_ns", 0.0), + "ops_per_sec": m["value"] if cat == "throughput" else 0.0, + }) + return rows + + +def compute_efficiency(rows: list) -> list: + """Compute a normalised-performance series for each (benchmark, op, dim). + + threads / agents → parallel efficiency = thr_N / (N × thr_1) × 100 + graph_size → relative throughput = thr_N / thr_min × 100 + (100 % at smallest graph, declining as graph grows) + + Returns a list of {benchmark, operation, scale_dim, scale_val, efficiency, + ops_per_sec} dicts. 
The JS chart uses the same field regardless of which + formula was applied; the label/title is updated per-dimension in JS. + """ + from collections import defaultdict + + groups: dict = defaultdict(list) + for r in rows: + if r["category"] != "throughput": + continue + key = (r["benchmark"], r["operation"], r["scale_dim"]) + groups[key].append(r) + + result = [] + for (bench, op, dim), pts in groups.items(): + pts_sorted = sorted(pts, key=lambda p: p["scale_val"]) + + if dim in ("threads", "agents"): + baseline = next((p for p in pts_sorted if p["scale_val"] == 1), None) + if baseline is None or baseline["ops_per_sec"] == 0: + continue + thr_1 = baseline["ops_per_sec"] + for p in pts_sorted: + N = p["scale_val"] + if N == 0: + continue + efficiency = (p["ops_per_sec"] / (N * thr_1)) * 100.0 + result.append({ + "benchmark": bench, "operation": op, "scale_dim": dim, + "scale_val": N, "efficiency": round(efficiency, 2), + "ops_per_sec": p["ops_per_sec"], + }) + + elif dim == "graph_size": + if not pts_sorted or pts_sorted[0]["ops_per_sec"] == 0: + continue + thr_min = pts_sorted[0]["ops_per_sec"] + for p in pts_sorted: + relative = (p["ops_per_sec"] / thr_min) * 100.0 + result.append({ + "benchmark": bench, "operation": op, "scale_dim": dim, + "scale_val": p["scale_val"], "efficiency": round(relative, 2), + "ops_per_sec": p["ops_per_sec"], + }) + + return result + + +# ── HTML generation ─────────────────────────────────────────────────────────── + +def generate_html( + run_info: dict, + bench_files: list, + output_path: str, + baseline_info: Optional[dict] = None, + baseline_files: Optional[list] = None, +): + latency, throughput = flatten_metrics(bench_files) + b_latency, b_throughput = (flatten_metrics(baseline_files) if baseline_files else ([], [])) + + scl_rows = flatten_scalability(bench_files) + eff_rows = compute_efficiency(scl_rows) + b_scl_rows = flatten_scalability(baseline_files) if baseline_files else [] + + run_id = run_info.get("id", "unknown") + 
run_label = run_info.get("label") or run_id + b_id = baseline_info.get("id", "") if baseline_info else "" + b_label = (baseline_info.get("label") or b_id) if baseline_info else "" + comparing = bool(baseline_files) + generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + latency_json = json.dumps(latency) + throughput_json = json.dumps(throughput) + b_latency_json = json.dumps(b_latency) + b_throughput_json = json.dumps(b_throughput) + run_info_json = json.dumps(run_info) + b_info_json = json.dumps(baseline_info or {}) + scl_json = json.dumps(scl_rows) + eff_json = json.dumps(eff_rows) + b_scl_json = json.dumps(b_scl_rows) + + # Summary rows + summary = [] + for b in bench_files: + summary.append({ + "benchmark": b.get("benchmark_name", b["_source_file"]), + "timestamp": b.get("timestamp", ""), + "duration": f"{b.get('total_duration_sec', 0):.1f}s", + "metrics": len(b.get("metrics", [])), + "source": b["_source_file"], + }) + summary_json = json.dumps(summary) + + compare_tab = '' if comparing else "" + compare_panel = "" + if comparing: + compare_panel = '
' + + html = f""" + + + + +Cortex Benchmark Report — {run_label} + + + + + + + +
+
+

Cortex Benchmark Report

+ {run_label} + {f' vs baseline: {b_label}' if comparing else ""} +
+
+ Generated: {generated_at}
+ {run_info.get("git_hash") and f"git: {run_info['git_hash']}" or ""} +
+
+ + + +
+ + +
+
+
+
+

Latency — Mean (µs)

+
+
+
+

Throughput (ops/sec)

+
+
+
+
+
+

Run Info

+
+
+
+
+
+

Benchmark Files

+ + + +
BenchmarkTimestampDurationMetricsFile
+
+
+
+ + +
+
+
+ +
+ + + +
+ +
+
Scroll to zoom · Click & drag to pan · Double-click to reset
+
+

Latency Distribution — Mean / p50 / p95 / p99

+
+
+
+

Latency Detail

+ + + + + + + +
LangBenchmarkOperationnMeanp50p95p99MinMaxCV%
+
+
+
+ + +
+
+
+ +
+ + + +
+
+
+

Operations per Second

+
+
+
+

Throughput Detail

+ + + + + +
LangBenchmarkOperationOps/secTotal OpsDuration
+
+
+
+ + +
+
+
+ + +
+
+
+

Throughput (ops/sec)

+
+
+
+

Mean Latency (µs)

+
+
+
+
+

Scaling Efficiency (% of ideal linear)

+
+
+
+

Scalability Detail

+ + + + + + +
BenchmarkOperationDimensionScaleThroughputMean LatencyEfficiency %
+
+
+
+ + +{compare_panel} + + +
+
+
+ +
+ + + + +""" + + with open(output_path, "w", encoding="utf-8") as f: + f.write(html) + print(f"Report written to: {os.path.abspath(output_path)}") + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Generate visual HTML benchmark report") + parser.add_argument("--run", "-r", help="Run ID to report on (default: latest)") + parser.add_argument("--baseline", "-b", help="Run ID to compare against") + parser.add_argument("--results-root", default=DEFAULT_RESULTS_ROOT) + parser.add_argument("--output", "-o", help="Output HTML file (default: /report.html)") + parser.add_argument("--list", action="store_true", help="List available runs") + args = parser.parse_args() + + runs = load_runs_index() + + if args.list: + if not runs: + print("No runs recorded. Run 'python run_all.py' first.") + return + print(f"{'ID':<22} {'Label':<20} Dir") + print("-" * 70) + for r in runs: + print(f"{r['id']:<22} {(r.get('label') or '-'):<20} {r['dir']}") + return + + # Resolve target run + if args.run: + run_dir = resolve_run_dir(args.run, args.results_root) + elif runs: + # Latest run + latest = runs[-1] + run_dir = os.path.join(args.results_root, latest["dir"]) + print(f"Using latest run: {latest['id']}") + else: + # Fallback: flat results directory (old layout) + run_dir = args.results_root + print(f"No runs index found, reading from: {run_dir}") + + run_info = load_run_info(run_dir) + bench_files = load_run_metrics(run_dir) + if not bench_files: + print(f"No metric JSON files found in: {run_dir}", file=sys.stderr) + sys.exit(1) + print(f"Loaded {len(bench_files)} metric file(s) from run '{run_info.get('id', run_dir)}'") + + # Resolve baseline + baseline_info, baseline_files = None, None + if args.baseline: + b_dir = resolve_run_dir(args.baseline, args.results_root) + baseline_info = load_run_info(b_dir) + baseline_files = load_run_metrics(b_dir) + print(f"Baseline: {len(baseline_files)} 
file(s) from run '{baseline_info.get('id', b_dir)}'") + + output_path = args.output or os.path.join(run_dir, "report.html") + generate_html(run_info, bench_files, output_path, baseline_info, baseline_files) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py new file mode 100644 index 0000000..4ac790e --- /dev/null +++ b/benchmarks/run_benchmarks.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python3 +""" +Top-level DSR benchmark runner — executes C++ and Python suites in one shot. + +Usage: + python run_benchmarks.py # run both suites + python run_benchmarks.py --label "after-fix" # named run + python run_benchmarks.py --cpp-only # skip Python + python run_benchmarks.py --python-only # skip C++ + python run_benchmarks.py --build # cmake build before running + python run_benchmarks.py --cpp-filter "[LATENCY]"# pass filter to dsr_benchmarks + python run_benchmarks.py --report # open HTML report when done + python run_benchmarks.py --compare # compare against a previous run + python run_benchmarks.py --list # list recorded runs + python run_benchmarks.py --delete # remove a run from the index + python run_benchmarks.py --repeat 5 # run C++ 5× and report median + python run_benchmarks.py --priority -10 # run with higher OS priority (requires root) + python run_benchmarks.py --taskset 0,1 # pin C++ benchmarks to CPU cores 0 and 1 +""" + +import sys +import os +import subprocess +import time +import json +import argparse +import platform +import shlex +from typing import Optional +from datetime import datetime + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PYTHON_DIR = os.path.join(SCRIPT_DIR, "python") +BUILD_DIR = os.path.join(SCRIPT_DIR, "build") +RESULTS_ROOT = os.path.join(SCRIPT_DIR, "results") +RUNS_INDEX = os.path.join(RESULTS_ROOT, "runs.json") + + +# ── Index helpers (mirrors python/run_all.py) ────────────────────────────────── + +def load_runs() -> list: + if not os.path.isfile(RUNS_INDEX): + 
return [] + with open(RUNS_INDEX) as f: + return json.load(f) + + +def save_runs(runs: list): + os.makedirs(RESULTS_ROOT, exist_ok=True) + with open(RUNS_INDEX, "w") as f: + json.dump(runs, f, indent=2) + + +def register_run(run_info: dict): + runs = load_runs() + runs = [r for r in runs if r["id"] != run_info["id"]] + runs.append(run_info) + runs.sort(key=lambda r: r["id"]) + save_runs(runs) + + +# ── Locate C++ binary ───────────────────────────────────────────────────────── + +def find_cpp_binary(override: Optional[str]) -> Optional[str]: + if override: + return override if os.path.isfile(override) else None + candidate = os.path.join(BUILD_DIR, "dsr_benchmarks") + return candidate if os.path.isfile(candidate) else None + + +def win_to_wsl(path: str) -> str: + """Convert a Windows absolute path to a WSL /mnt/... path.""" + path = path.replace("\\", "/") + if len(path) >= 2 and path[1] == ":": + drive = path[0].lower() + path = f"/mnt/{drive}{path[2:]}" + return path + + +def is_wsl_needed() -> bool: + """Return True if we're on Windows and wsl.exe is available (ELF binary).""" + if platform.system() != "Windows": + return False + try: + subprocess.run(["wsl", "--version"], capture_output=True, timeout=3) + return True + except Exception: + return False + + +# ── Build step ──────────────────────────────────────────────────────────────── + +def build_cpp() -> bool: + if not os.path.isdir(BUILD_DIR): + print(f"Build directory not found: {BUILD_DIR}") + return False + print("Building C++ benchmarks...") + if is_wsl_needed(): + wsl_build = win_to_wsl(BUILD_DIR) + result = subprocess.run( + ["wsl", "-e", "bash", "-c", f"cmake --build {wsl_build} --parallel"], + cwd=SCRIPT_DIR, + ) + else: + result = subprocess.run( + ["cmake", "--build", BUILD_DIR, "--parallel"], + cwd=SCRIPT_DIR, + ) + return result.returncode == 0 + + +# ── Median merge ────────────────────────────────────────────────────────────── + +def _median(values: list) -> float: + """Return the median of a 
list of numbers (handles even-length lists).""" + import statistics + return statistics.median(values) if values else 0.0 + + +def merge_cpp_results(src_dirs: list[str], dest_dir: str): + """ + Load the same JSON result files from N run directories and write a merged + copy to dest_dir where each metric's numerical fields are replaced by the + median across all N runs. Non-numeric fields (name, unit, tags, category) + are taken from the first run. + + This cancels OS-scheduler noise: a single run that was preempted by a + Windows background process no longer inflates the reported mean. + """ + import statistics as _stats + + os.makedirs(dest_dir, exist_ok=True) + + # Collect all JSON basenames present in any source directory + all_files: set[str] = set() + for d in src_dirs: + results_d = os.path.join(d, "results") + if os.path.isdir(results_d): + for f in os.listdir(results_d): + if f.endswith(".json"): + all_files.add(f) + + merged_count = 0 + for basename in sorted(all_files): + # Load this file from every run that has it + loaded = [] + for d in src_dirs: + path = os.path.join(d, "results", basename) + if os.path.isfile(path): + try: + with open(path) as fh: + loaded.append(json.load(fh)) + except Exception as e: + print(f" Warning: could not load {path}: {e}", file=sys.stderr) + + if not loaded: + continue + + if len(loaded) == 1: + # Only one run has this file — copy as-is + import shutil + shutil.copy(os.path.join(src_dirs[0], "results", basename), + os.path.join(dest_dir, basename)) + continue + + # Build merged result: start from first run's structure + merged = json.loads(json.dumps(loaded[0])) # deep copy + + # Index metrics by name+tags key so we match the right metric across runs + def metric_key(m: dict) -> str: + tags = m.get("tags", {}) + tag_str = ",".join(f"{k}={v}" for k, v in sorted(tags.items())) + return f"{m['name']}|{tag_str}" + + per_run_metrics: dict[str, list[dict]] = {} + for run_data in loaded: + for m in run_data.get("metrics", []): + 
k = metric_key(m) + per_run_metrics.setdefault(k, []).append(m) + + merged_metrics = [] + for m in merged.get("metrics", []): + k = metric_key(m) + peers = per_run_metrics.get(k, [m]) + if len(peers) < 2: + merged_metrics.append(m) + continue + + merged_m = json.loads(json.dumps(m)) # deep copy + # Median the top-level value + values = [p["value"] for p in peers if isinstance(p.get("value"), (int, float))] + if values: + merged_m["value"] = _median(values) + + # Median all additional numeric fields + all_add_keys: set[str] = set() + for p in peers: + all_add_keys.update(p.get("additional", {}).keys()) + for key in all_add_keys: + vals = [p.get("additional", {}).get(key) + for p in peers if isinstance(p.get("additional", {}).get(key), (int, float))] + if vals: + merged_m.setdefault("additional", {})[key] = _median(vals) + + merged_metrics.append(merged_m) + + merged["metrics"] = merged_metrics + merged.setdefault("metadata", {})["repeat_runs"] = str(len(loaded)) + merged["metadata"]["aggregation"] = "median" + + out_path = os.path.join(dest_dir, basename) + with open(out_path, "w") as fh: + json.dump(merged, fh, indent=2) + merged_count += 1 + + print(f" Merged {merged_count} result file(s) from {len(src_dirs)} runs (median)") + + +# ── Run C++ suite ───────────────────────────────────────────────────────────── + +def _build_cpp_cmd(binary: str, catch2_filter: Optional[str], verbose: bool, + priority: Optional[int], taskset: Optional[str]) -> str: + """Build the shell command string for one C++ benchmark invocation.""" + parts = [] + if taskset: + parts += [f"taskset -c {shlex.quote(taskset)}"] + if priority is not None: + parts += [f"nice -n {priority}"] + wsl_binary = win_to_wsl(binary) if is_wsl_needed() else binary + parts.append(shlex.quote(wsl_binary)) + if catch2_filter: + parts.append(shlex.quote(catch2_filter)) + if verbose: + parts.append("--verbose") + return " ".join(parts) + + +def _run_cpp_once(binary: str, cpp_cwd: str, catch2_filter: Optional[str], + 
verbose: bool, priority: Optional[int], taskset: Optional[str]) -> tuple[bool, float]: + os.makedirs(cpp_cwd, exist_ok=True) + start = time.time() + if is_wsl_needed(): + wsl_cwd = win_to_wsl(cpp_cwd) + cmd_str = _build_cpp_cmd(binary, catch2_filter, verbose, priority, taskset) + bash_cmd = f"cd {wsl_cwd} && {cmd_str}" + result = subprocess.run(["wsl", "-e", "bash", "-c", bash_cmd]) + else: + cmd = [] + if taskset: + cmd += ["taskset", "-c", taskset] + if priority is not None: + cmd += ["nice", "-n", str(priority)] + cmd.append(binary) + if catch2_filter: + cmd.append(catch2_filter) + if verbose: + cmd.append("--verbose") + result = subprocess.run(cmd, cwd=cpp_cwd) + duration = time.time() - start + return result.returncode == 0, duration + + +def run_cpp(binary: str, run_dir: str, catch2_filter: Optional[str], verbose: bool, + repeat: int = 1, priority: Optional[int] = None, taskset: Optional[str] = None): + """ + Run dsr_benchmarks 'repeat' times. If repeat > 1, each invocation writes + to a separate cpp_N/ subdirectory; results are then median-merged into + cpp/results/ so the rest of the pipeline sees a single stable result set. 
+ """ + print(f"\n{'=' * 70}") + print(f"Running: C++ benchmarks ({os.path.basename(binary)})") + if catch2_filter: + print(f"Filter : {catch2_filter}") + if repeat > 1: + print(f"Repeat : {repeat}× (median aggregation)") + if priority is not None: + print(f"Priority: nice {priority:+d}") + if taskset: + print(f"CPU affinity: {taskset}") + print("=" * 70) + + total_start = time.time() + all_ok = True + + if repeat <= 1: + # Single run — original behaviour + cpp_cwd = os.path.join(run_dir, "cpp") + print(f"Output : {cpp_cwd}/results/") + ok, dur = _run_cpp_once(binary, cpp_cwd, catch2_filter, verbose, priority, taskset) + all_ok = ok + else: + # Multiple runs → median merge + run_cwds = [] + for r in range(1, repeat + 1): + cpp_cwd = os.path.join(run_dir, f"cpp_{r}") + print(f"\n--- Run {r}/{repeat} → {cpp_cwd}/results/ ---") + ok, dur = _run_cpp_once(binary, cpp_cwd, catch2_filter, verbose, priority, taskset) + if not ok: + print(f" Warning: run {r} exited non-zero") + all_ok = False + run_cwds.append(cpp_cwd) + + # Merge into canonical cpp/results/ + dest = os.path.join(run_dir, "cpp", "results") + print(f"\nMerging {repeat} runs → {dest}") + merge_cpp_results(run_cwds, dest) + + total_dur = time.time() - total_start + print(f"\nC++ suite {'PASSED' if all_ok else 'FAILED'} in {total_dur:.1f}s") + return all_ok, total_dur + + +# ── Run Python suite ────────────────────────────────────────────────────────── + +def run_python(run_dir: str, label: Optional[str]): + """ + Delegate to python/run_all.py passing BENCH_RESULTS_DIR so Python files + land directly in / (not a subdirectory). 
+ """ + print(f"\n{'=' * 70}") + print("Running: Python benchmarks") + print(f"Output : {run_dir}/") + print("=" * 70) + + env = {**os.environ, "BENCH_RESULTS_DIR": run_dir} + cmd = [sys.executable, os.path.join(PYTHON_DIR, "run_all.py"), "--direct"] + # --direct: benchmarks write to BENCH_RESULTS_DIR, skip run_all.py's own + # index registration so run_benchmarks.py stays the single source of truth. + + start = time.time() + result = subprocess.run(cmd, cwd=PYTHON_DIR, env=env) + duration = time.time() - start + + ok = result.returncode == 0 + print(f"\nPython suite {'PASSED' if ok else 'FAILED'} in {duration:.1f}s") + return ok, duration + + +# ── Commands ────────────────────────────────────────────────────────────────── + +def cmd_list(): + runs = load_runs() + if not runs: + print("No runs recorded yet.") + return + print(f"{'ID':<22} {'Label':<20} {'Suites':<12} {'Duration':>9}") + print("-" * 70) + for r in runs: + suites = ", ".join(r.get("suites_run", [])) or "-" + dur = f"{r.get('total_duration_sec', 0):.1f}s" + label = r.get("label") or "-" + print(f"{r['id']:<22} {label:<20} {suites:<12} {dur:>9}") + + +def cmd_delete(run_id: str): + runs = load_runs() + before = len(runs) + runs = [r for r in runs if r["id"] != run_id] + if len(runs) == before: + print(f"Run '{run_id}' not found in index.") + return + save_runs(runs) + print(f"Removed run '{run_id}' from index (files kept on disk).") + + +def cmd_run(args): + ts = datetime.now() + run_id = ts.strftime("%Y%m%dT%H%M%S") + dir_name = run_id if not args.label else f"{run_id}_{args.label.replace(' ', '-')}" + run_dir = os.path.join(RESULTS_ROOT, dir_name) + os.makedirs(run_dir, exist_ok=True) + + print("=" * 70) + print(" DSR Benchmark Suite (C++ + Python)") + print(f" Run ID : {run_id}") + if args.label: + print(f" Label : {args.label}") + print(f" Output : {run_dir}") + print("=" * 70) + + # Optionally build C++ + if args.build: + if not build_cpp(): + print("Build failed — aborting.") + return 1 + + 
suites_run = [] + results = {} + total_start = time.time() + + # C++ suite + if not args.python_only: + binary = find_cpp_binary(args.cpp_binary) + if binary: + ok, dur = run_cpp(binary, run_dir, args.cpp_filter, args.verbose, + repeat=args.repeat, priority=args.priority, taskset=args.taskset) + results["cpp"] = {"ok": ok, "duration_sec": dur} + suites_run.append("cpp") + else: + print("\nWARNING: C++ binary not found. Use --cpp-binary or --build.") + print(f" Searched: {os.path.join(BUILD_DIR, 'dsr_benchmarks')}") + results["cpp"] = {"ok": False, "duration_sec": 0, "skipped": True} + + # Python suite + if not args.cpp_only: + ok, dur = run_python(run_dir, args.label) + results["python"] = {"ok": ok, "duration_sec": dur} + suites_run.append("python") + + total_duration = time.time() - total_start + + # Gather git hash + try: + git_hash = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + cwd=SCRIPT_DIR, stderr=subprocess.DEVNULL, + ).decode().strip() + except Exception: + git_hash = "" + + run_info = { + "id": run_id, + "label": args.label or "", + "dir": dir_name, + "timestamp": ts.isoformat(), + "total_duration_sec": round(total_duration, 2), + "suites_run": suites_run, + "suites_passed": [s for s in suites_run if results.get(s, {}).get("ok")], + "git_hash": git_hash, + "platform": platform.platform(), + "python": sys.version.split()[0], + } + + with open(os.path.join(run_dir, "run_info.json"), "w") as f: + json.dump(run_info, f, indent=2) + + register_run(run_info) + + # Summary + print("\n" + "=" * 70) + print(" Summary") + print("=" * 70) + all_ok = True + for suite in ["cpp", "python"]: + if suite not in results: + continue + r = results[suite] + if r.get("skipped"): + print(f" [SKIP] {suite}") + else: + status = "PASS" if r["ok"] else "FAIL" + print(f" [{status}] {suite} ({r['duration_sec']:.1f}s)") + if not r["ok"]: + all_ok = False + + print(f"\n Run ID : {run_id}") + print(f" Results : {run_dir}") + print(f" Index : {RUNS_INDEX}") + + # 
Generate report + if args.report or args.compare: + report_args = ["--run", run_id, "--results-root", RESULTS_ROOT] + if args.compare: + report_args += ["--baseline", args.compare] + report_path = os.path.join(run_dir, "report.html") + report_args += ["--output", report_path] + + print(f"\nGenerating report...") + subprocess.run( + [sys.executable, os.path.join(SCRIPT_DIR, "report.py")] + report_args, + cwd=SCRIPT_DIR, + ) + + if args.open_report and os.path.isfile(report_path): + import webbrowser + webbrowser.open(f"file://{report_path}") + + return 0 if all_ok else 1 + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description="Run DSR C++ and Python benchmarks together", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--label", "-l", help="Human-readable label for this run") + parser.add_argument("--cpp-binary", metavar="PATH", + help=f"Path to dsr_benchmarks binary (default: {os.path.join(BUILD_DIR, 'dsr_benchmarks')})") + parser.add_argument("--cpp-filter", metavar="FILTER", + help='Catch2 test filter, e.g. 
"[LATENCY]" or "[THROUGHPUT]"') + parser.add_argument("--build", action="store_true", + help="Build C++ benchmarks before running") + parser.add_argument("--cpp-only", action="store_true", help="Skip Python suite") + parser.add_argument("--python-only", action="store_true", help="Skip C++ suite") + parser.add_argument("--verbose", "-v", action="store_true", + help="Pass --verbose to C++ binary (shows Qt debug messages)") + parser.add_argument("--report", action="store_true", + help="Generate HTML report after the run") + parser.add_argument("--open", dest="open_report", action="store_true", + help="Open the HTML report in a browser after generation") + parser.add_argument("--compare", metavar="RUN_ID", + help="Generate a comparison report against this baseline run") + parser.add_argument("--list", action="store_true", help="List all recorded runs") + parser.add_argument("--delete", metavar="RUN_ID", + help="Remove a run from the index") + parser.add_argument("--repeat", "-r", type=int, default=1, metavar="N", + help="Run C++ benchmarks N times and report the median (reduces OS noise)") + parser.add_argument("--priority", type=int, default=None, metavar="NICE", + help="Set process nice level (e.g. -10); values < 0 require root/sudo") + parser.add_argument("--taskset", metavar="CPULIST", + help="Pin C++ benchmarks to CPU cores via taskset (e.g. 
'0,1')") + + args = parser.parse_args() + + if args.list: + cmd_list() + return 0 + + if args.delete: + cmd_delete(args.delete) + return 0 + + if args.cpp_only and args.python_only: + print("Error: --cpp-only and --python-only are mutually exclusive.") + return 1 + + return cmd_run(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/scalability/agent_scaling_bench.cpp b/benchmarks/scalability/agent_scaling_bench.cpp new file mode 100644 index 0000000..af0fbac --- /dev/null +++ b/benchmarks/scalability/agent_scaling_bench.cpp @@ -0,0 +1,276 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// Multi-agent scaling benchmarks. Tagged [.multi] so they are excluded from +// the default test run (DDS multi-agent tests are slow and require specific +// network setup). Opt in with: --cpp-filter "[SCALABILITY][agents]" +// +// Loop over {1, 2, 4} agents. One thread per agent operates on its own +// DSRGraph instance; a 3-second window measures total throughput and latency. 
+ +static constexpr auto AGENT_DUR = std::chrono::seconds(3); + +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert agent scaling", "[SCALABILITY][agents][.multi]") { + GraphGenerator generator; + MetricsCollector collector("node_insert_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 800000ULL + agent_idx * 200000ULL; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + uint64_t ts = bench_now(); + graph->insert_node(node); + samples.push_back(bench_now() - ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_insert", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_insert", merged.stats(), + {{"agents", n_str}}); + + double 
ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_insert", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_agent_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read agent scaling", "[SCALABILITY][agents][.multi]") { + GraphGenerator generator; + MetricsCollector collector("node_read_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + // Pre-populate 1000 nodes on agent 0; they sync to all agents. + auto* graph0 = fixture.get_agent(0); + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph0->get_agent_id()); + auto res = graph0->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + fixture.wait_for_sync(); + + const size_t pool_size = node_ids.size(); + + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = node_ids[local_ops % pool_size]; + uint64_t ts = bench_now(); + auto node = graph->get_node(id); + samples.push_back(bench_now() - 
ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_read", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_read", merged.stats(), + {{"agents", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_read", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_agent_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update agent scaling", "[SCALABILITY][agents][.multi]") { + GraphGenerator generator; + MetricsCollector collector("node_update_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + // Each agent gets its own dedicated node to avoid update contention. 
+ std::vector agent_node_ids(N); + for (uint32_t i = 0; i < N; ++i) { + auto* graph = fixture.get_agent(i); + auto node = GraphGenerator::create_test_node( + 700000 + i, graph->get_agent_id(), + "agent_update_node_" + std::to_string(i)); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + agent_node_ids[i] = res.value(); + } + fixture.wait_for_sync(); + + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t nid = agent_node_ids[agent_idx]; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(nid); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(local_ops % 1000)); + uint64_t ts = bench_now(); + graph->update_node(*node); + samples.push_back(bench_now() - ts); + local_ops++; + } + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_update", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_update", merged.stats(), + {{"agents", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + 
collector.record_scalability("node_update", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_agent_scaling"); +} diff --git a/benchmarks/scalability/graph_size_impact_bench.cpp b/benchmarks/scalability/graph_size_impact_bench.cpp new file mode 100644 index 0000000..0786e96 --- /dev/null +++ b/benchmarks/scalability/graph_size_impact_bench.cpp @@ -0,0 +1,275 @@ +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Graph size impact on performance", "[SCALABILITY][graphsize]") { + MetricsCollector collector("graph_size_impact"); + GraphGenerator generator; + + SECTION("Node lookup performance vs graph size") { + for (uint32_t size : {100, 1000, 10000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate graph and store actual IDs + std::vector node_ids; + node_ids.reserve(size); + for (uint32_t i = 0; i < size; ++i) { + auto node = GraphGenerator::create_test_node( + 0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) { + node_ids.push_back(result.value()); + } + } + REQUIRE(!node_ids.empty()); + + // Measure lookup performance + LatencyTracker tracker(1000); + + for (int i = 0; i < 1000; ++i) { + uint64_t id = node_ids[i % node_ids.size()]; + auto timer = tracker.scoped_record(); + auto node = graph->get_node(id); + } + + auto stats = tracker.stats(); + collector.record_scalability( + "node_lookup", + size, + stats.mean_ns, + "ns", + {{"graph_size", 
std::to_string(size)}}); + + INFO(size << " nodes - Lookup: " << stats.mean_ns << " ns"); + } + } + + SECTION("Node insertion performance vs graph size") { + for (uint32_t size : {100, 1000, 10000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate graph to target size + for (uint32_t i = 0; i < size; ++i) { + auto node = GraphGenerator::create_test_node( + 2000000 + i, graph->get_agent_id()); + graph->insert_node(node); + } + + // Measure insertion performance + LatencyTracker tracker(100); + + for (int i = 0; i < 100; ++i) { + auto node = GraphGenerator::create_test_node( + 3000000 + i, graph->get_agent_id()); + + auto timer = tracker.scoped_record(); + graph->insert_node(node); + } + + auto stats = tracker.stats(); + collector.record_scalability( + "node_insert_latency", + size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(size)}}); + + INFO(size << " existing nodes - Insert: " << stats.mean_us() << " us"); + } + } + + SECTION("Edge operations vs edge count") { + for (uint32_t edge_count : {100, 1000, 5000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Create nodes for edges and store actual IDs + std::vector node_ids; + node_ids.reserve(edge_count + 100); + for (uint32_t i = 0; i < edge_count + 100; ++i) { + auto node = GraphGenerator::create_test_node( + 0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) { + node_ids.push_back(result.value()); + } + } + REQUIRE(node_ids.size() >= edge_count); + + // Create edges + for (uint32_t i = 0; i < edge_count; ++i) { + auto edge = 
GraphGenerator::create_test_edge( + root->id(), node_ids[i], graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + + // Measure edge lookup performance + LatencyTracker lookup_tracker(1000); + for (int i = 0; i < 1000; ++i) { + uint64_t target = node_ids[i % edge_count]; + auto timer = lookup_tracker.scoped_record(); + auto edge = graph->get_edge(root->id(), target, "test_edge"); + } + + auto lookup_stats = lookup_tracker.stats(); + collector.record_scalability( + "edge_lookup", + edge_count, + lookup_stats.mean_ns, + "ns", + {{"edge_count", std::to_string(edge_count)}}); + + // Measure edge insertion performance + LatencyTracker insert_tracker(100); + for (int i = 0; i < 100; ++i) { + uint64_t target = node_ids[edge_count + i]; + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + + auto timer = insert_tracker.scoped_record(); + graph->insert_or_assign_edge(edge); + } + + auto insert_stats = insert_tracker.stats(); + collector.record_scalability( + "edge_insert_latency", + edge_count, + insert_stats.mean_us(), + "us", + {{"edge_count", std::to_string(edge_count)}}); + + INFO(edge_count << " edges - Lookup: " << lookup_stats.mean_ns + << " ns, Insert: " << insert_stats.mean_us() << " us"); + } + } + + SECTION("get_nodes performance vs graph size") { + for (uint32_t size : {100, 1000, 5000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate + for (uint32_t i = 0; i < size; ++i) { + auto node = GraphGenerator::create_test_node( + 5000000 + i, graph->get_agent_id()); + graph->insert_node(node); + } + + // Measure full scan + LatencyTracker tracker(100); + for (int i = 0; i < 100; ++i) { + auto timer = tracker.scoped_record(); + auto nodes = graph->get_nodes(); + } + + auto stats = tracker.stats(); + collector.record_scalability( + 
"get_all_nodes", + size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(size)}}); + + INFO(size << " nodes - get_nodes: " << stats.mean_us() << " us"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "graph_size_impact"); +} + +TEST_CASE("Memory pressure impact", "[SCALABILITY][memory]") { + MetricsCollector collector("memory_pressure"); + GraphGenerator generator; + + SECTION("Operation latency under memory pressure") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Create increasingly large graph and measure periodically + std::vector> size_vs_latency; + + for (uint32_t target_size : {1000, 5000, 10000, 20000}) { + // Add nodes to reach target size + uint64_t current_size = graph->get_nodes().size(); + for (uint64_t i = current_size; i < target_size; ++i) { + auto node = GraphGenerator::create_test_node( + 6000000 + i, graph->get_agent_id()); + graph->insert_node(node); + } + + // Measure insertion latency + LatencyTracker tracker(50); + for (int i = 0; i < 50; ++i) { + auto node = GraphGenerator::create_test_node( + 7000000 + target_size * 100 + i, graph->get_agent_id()); + + auto timer = tracker.scoped_record(); + graph->insert_node(node); + } + + auto stats = tracker.stats(); + collector.record_scalability( + "insert_under_pressure", + target_size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(target_size)}}); + + size_vs_latency.push_back({target_size, stats.mean_us()}); + INFO(target_size << " nodes - Insert latency: " << stats.mean_us() << " us"); + } + + // Check for non-linear degradation + if (size_vs_latency.size() >= 2) { + double first_latency = size_vs_latency.front().second; + double last_latency = size_vs_latency.back().second; + double size_ratio = static_cast(size_vs_latency.back().first) / + 
static_cast(size_vs_latency.front().first); + double latency_ratio = last_latency / first_latency; + + collector.record("latency_degradation_ratio", MetricCategory::Scalability, + latency_ratio / size_ratio, "x"); + + INFO("Latency degradation ratio: " << latency_ratio / size_ratio << "x"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "memory_pressure"); +} diff --git a/benchmarks/scalability/graph_size_scaling_bench.cpp b/benchmarks/scalability/graph_size_scaling_bench.cpp new file mode 100644 index 0000000..c748b2a --- /dev/null +++ b/benchmarks/scalability/graph_size_scaling_bench.cpp @@ -0,0 +1,313 @@ +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// For each operation, measures latency (1000 samples) and throughput (3-second +// window) at three pre-existing graph sizes: {100, 1000, 10000} nodes. +// "graph_size" = number of nodes already in the graph before measurement +// begins, so the benchmark captures the cost of operating on an N-node graph. 
+ +static constexpr auto GS_THR_DUR = std::chrono::seconds(3); + +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_insert_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate to target size + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(2000000 + i, graph->get_agent_id()); + graph->insert_node(node); + } + + // Latency — 1000 samples + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(3000000 + i, graph->get_agent_id()); + auto t = tracker.scoped_record(); + graph->insert_node(node); + } + auto stats = tracker.stats(); + + // Throughput — 3-second window + uint64_t ops = 0; + auto start = steady_clock::now(); + auto end = start + GS_THR_DUR; + while (steady_clock::now() < end) { + auto node = GraphGenerator::create_test_node(4000000 + ops, graph->get_agent_id()); + graph->insert_node(node); + ops++; + } + auto dur = duration_cast(steady_clock::now() - start); + + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_insert", stats, {{"graph_size", n_str}}); + collector.record_throughput("node_insert", ops, dur, {{"graph_size", n_str}}); + collector.record_scalability("node_insert", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_graphsize_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read graph size scaling", 
"[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_read_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + // Latency — 1000 samples + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + uint64_t id = node_ids[i % node_ids.size()]; + auto t = tracker.scoped_record(); + auto node = graph->get_node(id); + } + auto stats = tracker.stats(); + + // Throughput — 3-second window + uint64_t ops = 0; + auto start = steady_clock::now(); + auto end = start + GS_THR_DUR; + while (steady_clock::now() < end) { + uint64_t id = node_ids[ops % node_ids.size()]; + auto node = graph->get_node(id); + ops++; + } + auto dur = duration_cast(steady_clock::now() - start); + + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_read", stats, {{"graph_size", n_str}}); + collector.record_throughput("node_read", ops, dur, {{"graph_size", n_str}}); + collector.record_scalability("node_read", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_graphsize_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_update_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 
10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + // Latency — 1000 samples + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + uint64_t id = node_ids[i % node_ids.size()]; + auto node = graph->get_node(id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(i % 1000)); + auto t = tracker.scoped_record(); + graph->update_node(*node); + } + } + auto stats = tracker.stats(); + + // Throughput — 3-second window + uint64_t ops = 0; + auto start = steady_clock::now(); + auto end = start + GS_THR_DUR; + while (steady_clock::now() < end) { + uint64_t id = node_ids[ops % node_ids.size()]; + auto node = graph->get_node(id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(ops % 1000)); + graph->update_node(*node); + ops++; + } + } + auto dur = duration_cast(steady_clock::now() - start); + + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_update", stats, {{"graph_size", n_str}}); + collector.record_throughput("node_update", ops, dur, {{"graph_size", n_str}}); + collector.record_scalability("node_update", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_graphsize_scaling"); +} + +// ── Edge insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Edge insert graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + 
MetricsCollector collector("edge_insert_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate N target nodes + std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + // Latency — 1000 samples + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + uint64_t target = node_ids[i % node_ids.size()]; + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + auto t = tracker.scoped_record(); + graph->insert_or_assign_edge(edge); + } + auto stats = tracker.stats(); + + // Throughput — 3-second window + uint64_t ops = 0; + auto start = steady_clock::now(); + auto end = start + GS_THR_DUR; + while (steady_clock::now() < end) { + uint64_t target = node_ids[ops % node_ids.size()]; + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + ops++; + } + auto dur = duration_cast(steady_clock::now() - start); + + const std::string n_str = std::to_string(N); + collector.record_latency_stats("edge_insert", stats, {{"graph_size", n_str}}); + collector.record_throughput("edge_insert", ops, dur, {{"graph_size", n_str}}); + collector.record_scalability("edge_insert", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_graphsize_scaling"); +} + +// ── Edge read 
───────────────────────────────────────────────────────────────── + +TEST_CASE("Edge read graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("edge_read_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate N nodes + edges + std::vector target_ids; + target_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) { + target_ids.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + } + REQUIRE(!target_ids.empty()); + + // Latency — 1000 samples + LatencyTracker tracker(1000); + for (int i = 0; i < 1000; ++i) { + uint64_t target = target_ids[i % target_ids.size()]; + auto t = tracker.scoped_record(); + auto edge = graph->get_edge(root->id(), target, "test_edge"); + } + auto stats = tracker.stats(); + + // Throughput — 3-second window + uint64_t ops = 0; + auto start = steady_clock::now(); + auto end = start + GS_THR_DUR; + while (steady_clock::now() < end) { + uint64_t target = target_ids[ops % target_ids.size()]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + ops++; + } + auto dur = duration_cast(steady_clock::now() - start); + + const std::string n_str = std::to_string(N); + collector.record_latency_stats("edge_read", stats, {{"graph_size", n_str}}); + collector.record_throughput("edge_read", ops, dur, {{"graph_size", n_str}}); + collector.record_scalability("edge_read", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + 
} + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_graphsize_scaling"); +} diff --git a/benchmarks/scalability/multi_agent_sync_bench.cpp b/benchmarks/scalability/multi_agent_sync_bench.cpp new file mode 100644 index 0000000..f52a033 --- /dev/null +++ b/benchmarks/scalability/multi_agent_sync_bench.cpp @@ -0,0 +1,286 @@ +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Multi-agent synchronization benchmarks", "[SCALABILITY][sync][.multi]") { + GraphGenerator generator; + MetricsCollector collector("multi_agent_sync"); + + SECTION("Initial sync time vs agent count") { + for (uint32_t num_agents : {2, 4, 8, 16}) { + auto config_file = generator.generate_empty_graph(); + + LatencyTracker tracker(10); + + for (int trial = 0; trial < 10; ++trial) { + MultiAgentFixture fixture; + + uint64_t start = get_unix_timestamp(); + bool created = fixture.create_agents(num_agents, config_file); + if (!created) { + WARN("Could not create " << num_agents << " agents"); + break; + } + + fixture.wait_for_sync(); + bool converged = fixture.verify_convergence(); + uint64_t elapsed = get_unix_timestamp() - start; + + if (converged) { + tracker.record(elapsed); + } + + // Cleanup before next trial + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "initial_sync_time", + num_agents, + stats.mean_ms(), + "ms", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Initial sync: " << stats.mean_ms() << " ms"); + } + } + } + + SECTION("Convergence time after operation") { + for (uint32_t num_agents : {2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = 
generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Measure convergence time after node insertion + for (int i = 0; i < 50; ++i) { + auto* sender = fixture.get_agent(0); + auto node = GraphGenerator::create_test_node( + 700000 + i, sender->get_agent_id(), + "sync_node_" + std::to_string(i)); + + uint64_t start = get_unix_timestamp(); + sender->insert_node(node); + + auto conv_time = fixture.measure_convergence_time(); + if (conv_time.count() >= 0) { + tracker.record(static_cast(conv_time.count()) * 1'000'000); // ms to ns + } + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "convergence_after_insert", + num_agents, + stats.mean_ms(), + "ms", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Convergence time: " << stats.mean_ms() << " ms"); + } + } + } + + SECTION("Broadcast time to all agents") { + for (uint32_t num_agents : {2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Track when each agent receives the update + std::vector> receive_times(num_agents - 1); + std::vector> received(num_agents - 1); + + for (size_t i = 1; i < num_agents; ++i) { + auto* receiver = fixture.get_agent(i); + QObject::connect(receiver, &DSR::DSRGraph::update_node_signal, receiver, + [&, idx = i - 1](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id >= 800000 && id < 900000 && !received[idx].load()) { + receive_times[idx].store(get_unix_timestamp()); + received[idx].store(true); + } + }, Qt::DirectConnection); + } + + auto* sender = fixture.get_agent(0); + + for (int i = 0; 
i < 50; ++i) { + // Reset tracking + for (size_t j = 0; j < num_agents - 1; ++j) { + receive_times[j].store(0); + received[j].store(false); + } + + auto node = GraphGenerator::create_test_node( + 800000 + i, sender->get_agent_id(), + "broadcast_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + sender->insert_node(node); + + // Wait for all receivers + auto start = std::chrono::steady_clock::now(); + while (true) { + bool all_received = true; + for (size_t j = 0; j < num_agents - 1; ++j) { + if (!received[j].load()) { + all_received = false; + break; + } + } + + if (all_received) break; + + fixture.process_events(1); + + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + break; + } + } + + // Find max receive time (last agent to receive) + uint64_t max_time = 0; + for (size_t j = 0; j < num_agents - 1; ++j) { + if (received[j].load()) { + max_time = std::max(max_time, receive_times[j].load()); + } + } + + if (max_time > send_time) { + tracker.record(max_time - send_time); + } + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "broadcast_to_all", + num_agents, + stats.mean_us(), + "us", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Broadcast time: " << stats.mean_us() << " us"); + } + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "multi_agent_sync"); +} + +TEST_CASE("Scaling efficiency", "[SCALABILITY][efficiency][.multi]") { + GraphGenerator generator; + MetricsCollector collector("scaling_efficiency"); + + std::map throughputs; + + SECTION("Throughput scaling with agents") { + for (uint32_t num_agents : {1, 2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + 
constexpr auto TEST_DURATION = std::chrono::seconds(3); + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + + std::vector threads; + threads.reserve(num_agents); + + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < num_agents; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 900000 + agent_idx * 50000; + uint64_t local_ops = 0; + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + graph->insert_node(node); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + throughputs[num_agents] = ops_per_sec; + + collector.record_scalability( + "throughput_scaling", + num_agents, + ops_per_sec, + "ops/sec", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Throughput: " << ops_per_sec << " ops/sec"); + } + + // Calculate scaling efficiency + if (throughputs.count(1) > 0 && throughputs.count(2) > 0) { + double efficiency_2 = throughputs[2] / (2 * throughputs[1]) * 100; + collector.record("scaling_efficiency_2_agents", MetricCategory::Scalability, + efficiency_2, "%"); + INFO("Scaling efficiency (2 agents): " << efficiency_2 << "%"); + } + + if (throughputs.count(1) > 0 && throughputs.count(4) > 0) { + double efficiency_4 = throughputs[4] / (4 * throughputs[1]) * 100; + collector.record("scaling_efficiency_4_agents", MetricCategory::Scalability, + efficiency_4, "%"); + INFO("Scaling efficiency (4 agents): " << efficiency_4 << "%"); + } + } + + auto result = collector.finalize(); + 
ReportGenerator reporter("results"); + reporter.export_all(result, "scaling_efficiency"); +} diff --git a/benchmarks/scalability/thread_scaling_bench.cpp b/benchmarks/scalability/thread_scaling_bench.cpp new file mode 100644 index 0000000..41d04af --- /dev/null +++ b/benchmarks/scalability/thread_scaling_bench.cpp @@ -0,0 +1,456 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// Measures throughput + latency across {1, 2, 4, 8} threads for each +// operation. Each iteration runs a 5-second window; per-thread raw latency +// samples are merged into a single LatencyTracker for aggregate stats. +// A record_scalability() entry is added so the Scalability tab can plot +// the efficiency curve (scale_dim = "threads"). 
+ +static constexpr auto THREAD_DUR = std::chrono::seconds(5); + +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t base_id = 200000ULL + tid * 200000ULL; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + uint64_t ts = bench_now(); + graph->insert_node(node); + samples.push_back(bench_now() - ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_insert", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_insert", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = 
static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_insert", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_thread_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate once; all thread-count iterations share this pool. + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + const size_t pool_size = node_ids.size(); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = node_ids[local_ops % pool_size]; + uint64_t ts = bench_now(); + auto node = graph->get_node(id); + samples.push_back(bench_now() - ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } 
+ + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_read", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_read", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_read", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_thread_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-insert 8 nodes (one per thread for the largest N); each thread + // updates its own node to measure scaling without lock contention. 
+ constexpr uint32_t MAX_THREADS = 8; + std::vector node_ids; + node_ids.reserve(MAX_THREADS); + for (uint32_t t = 0; t < MAX_THREADS; ++t) { + auto node = GraphGenerator::create_test_node( + 500000 + t, graph->get_agent_id(), + "update_node_" + std::to_string(t)); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + uint64_t nid = node_ids[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(nid); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(local_ops % 1000)); + uint64_t ts = bench_now(); + graph->update_node(*node); + samples.push_back(bench_now() - ts); + local_ops++; + } + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_update", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_update", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_update", N, ops_per_sec, 
"ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_thread_scaling"); +} + +// ── Edge insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Edge insert thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate target node pool; shared across all N iterations. + constexpr uint32_t POOL_SIZE = 10000; + std::vector pool; + pool.reserve(POOL_SIZE); + for (uint64_t i = 0; i < POOL_SIZE; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) pool.push_back(res.value()); + } + REQUIRE(!pool.empty()); + const size_t pool_size = pool.size(); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + const uint32_t stride = static_cast(pool_size / N) + 1; + auto wall_start = steady_clock::now(); + + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t idx = (local_ops + tid * stride) % pool_size; + auto edge = GraphGenerator::create_test_edge( + root->id(), pool[idx], graph->get_agent_id()); + uint64_t ts = bench_now(); + 
graph->insert_or_assign_edge(edge); + samples.push_back(bench_now() - ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("edge_insert", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("edge_insert", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("edge_insert", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_thread_scaling"); +} + +// ── Edge read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Edge read thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 1000 nodes + edges; shared across all N iterations. 
+ constexpr uint32_t POOL_SIZE = 1000; + std::vector pool; + pool.reserve(POOL_SIZE); + for (uint64_t i = 0; i < POOL_SIZE; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) { + pool.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + } + REQUIRE(!pool.empty()); + const size_t pool_size = pool.size(); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + const uint32_t stride = static_cast(pool_size / N) + 1; + auto wall_start = steady_clock::now(); + + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t idx = (local_ops + tid * stride) % pool_size; + uint64_t ts = bench_now(); + auto edge = graph->get_edge(root->id(), pool[idx], "test_edge"); + samples.push_back(bench_now() - ts); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("edge_read", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("edge_read", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = 
static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("edge_read", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_thread_scaling"); +} diff --git a/benchmarks/throughput/concurrent_writers_bench.cpp b/benchmarks/throughput/concurrent_writers_bench.cpp new file mode 100644 index 0000000..3363393 --- /dev/null +++ b/benchmarks/throughput/concurrent_writers_bench.cpp @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Concurrent writers throughput", "[THROUGHPUT][concurrent]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("concurrent_writers"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto run_concurrent_test = [&](uint32_t num_threads, const std::string& test_name) { + std::atomic total_operations{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(num_threads); + + // Base node ID for each thread to avoid collisions + std::vector threads; + threads.reserve(num_threads); + + auto start = std::chrono::steady_clock::now(); + + for (uint32_t t = 0; t < num_threads; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t base_id = 100000 + thread_id * 100000; + uint64_t local_ops = 0; + + // Synchronize start + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = 
GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id(), + "thread_" + std::to_string(thread_id) + "_node_" + std::to_string(local_ops)); + graph->insert_node(node); + local_ops++; + } + + total_operations.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput(test_name, total_operations.load(), actual_duration, + {{"num_threads", std::to_string(num_threads)}}); + + double ops_per_sec = static_cast(total_operations.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + return ops_per_sec; + }; + + SECTION("2 concurrent writers") { + double ops = run_concurrent_test(2, "concurrent_insert_2_threads"); + INFO("2 threads: " << ops << " ops/sec"); + CHECK(ops >= MIN_EXPECTED_THROUGHPUT_OPS); + } + + SECTION("4 concurrent writers") { + double ops = run_concurrent_test(4, "concurrent_insert_4_threads"); + INFO("4 threads: " << ops << " ops/sec"); + } + + SECTION("8 concurrent writers") { + double ops = run_concurrent_test(8, "concurrent_insert_8_threads"); + INFO("8 threads: " << ops << " ops/sec"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "concurrent_writers"); +} + +TEST_CASE("Concurrent read-write throughput", "[THROUGHPUT][concurrent]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("concurrent_read_write"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate graph and store actual IDs + std::vector pre_node_ids; + pre_node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = 
GraphGenerator::create_test_node( + 0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) { + pre_node_ids.push_back(result.value()); + } + } + REQUIRE(!pre_node_ids.empty()); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + SECTION("Mixed read-write workload") { + constexpr uint32_t NUM_READERS = 4; + constexpr uint32_t NUM_WRITERS = 2; + constexpr uint32_t TOTAL_THREADS = NUM_READERS + NUM_WRITERS; + + std::atomic read_ops{0}; + std::atomic write_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(TOTAL_THREADS); + + std::vector threads; + threads.reserve(TOTAL_THREADS); + + auto start = std::chrono::steady_clock::now(); + + // Reader threads + for (uint32_t t = 0; t < NUM_READERS; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t local_ops = 0; + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = pre_node_ids[local_ops % pre_node_ids.size()]; + auto node = graph->get_node(id); + local_ops++; + } + + read_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + // Writer threads + for (uint32_t t = 0; t < NUM_WRITERS; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t base_id = 300000 + thread_id * 100000; + uint64_t local_ops = 0; + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + graph->insert_node(node); + local_ops++; + } + + write_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("concurrent_reads", read_ops.load(), actual_duration, + {{"num_readers", std::to_string(NUM_READERS)}}); + 
collector.record_throughput("concurrent_writes", write_ops.load(), actual_duration, + {{"num_writers", std::to_string(NUM_WRITERS)}}); + + double read_ops_sec = static_cast(read_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + double write_ops_sec = static_cast(write_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + INFO("Read throughput: " << read_ops_sec << " ops/sec"); + INFO("Write throughput: " << write_ops_sec << " ops/sec"); + } + + SECTION("Update contention test") { + constexpr uint32_t NUM_THREADS = 4; + + // All threads update the same node + auto test_node = GraphGenerator::create_test_node( + 0, graph->get_agent_id(), "contention_test"); + auto contention_id_opt = graph->insert_node(test_node); + REQUIRE(contention_id_opt.has_value()); + uint64_t contention_node_id = contention_id_opt.value(); + + std::atomic total_ops{0}; + std::atomic successful_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(NUM_THREADS); + + std::vector threads; + threads.reserve(NUM_THREADS); + + auto start = std::chrono::steady_clock::now(); + + for (uint32_t t = 0; t < NUM_THREADS; ++t) { + threads.emplace_back([&, thread_id = t, node_id = contention_node_id]() { + uint64_t local_total = 0; + uint64_t local_success = 0; + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(node_id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(thread_id * 1000 + local_total)); + if (graph->update_node(*node)) { + local_success++; + } + } + local_total++; + } + + total_ops.fetch_add(local_total, std::memory_order_relaxed); + successful_ops.fetch_add(local_success, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + 
double success_rate = static_cast(successful_ops.load()) / + static_cast(total_ops.load()) * 100.0; + + collector.record("update_contention_total", MetricCategory::Throughput, + static_cast(total_ops.load()), "ops", + {{"num_threads", std::to_string(NUM_THREADS)}}); + collector.record("update_contention_success_rate", MetricCategory::Throughput, + success_rate, "%", + {{"num_threads", std::to_string(NUM_THREADS)}}); + + INFO("Total attempts: " << total_ops.load()); + INFO("Successful updates: " << successful_ops.load()); + INFO("Success rate: " << success_rate << "%"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "concurrent_read_write"); +} + +TEST_CASE("Multi-agent concurrent operations", "[THROUGHPUT][concurrent][multiagent][.multi]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("multiagent_concurrent"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + SECTION("Each agent writes independently") { + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(fixture.agent_count()); + + std::vector threads; + threads.reserve(fixture.agent_count()); + + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < fixture.agent_count(); ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 600000 + agent_idx * 100000; + uint64_t local_ops = 0; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id(), + "agent_" + std::to_string(agent_idx) + "_node_" + std::to_string(local_ops)); + graph->insert_node(node); + local_ops++; + } + + total_ops.fetch_add(local_ops, 
std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("multiagent_concurrent_insert", + total_ops.load(), actual_duration, + {{"num_agents", std::to_string(fixture.agent_count())}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + INFO("Multi-agent concurrent throughput: " << ops_per_sec << " ops/sec"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "multiagent_concurrent"); +} diff --git a/benchmarks/throughput/single_agent_ops_bench.cpp b/benchmarks/throughput/single_agent_ops_bench.cpp new file mode 100644 index 0000000..8875ab9 --- /dev/null +++ b/benchmarks/throughput/single_agent_ops_bench.cpp @@ -0,0 +1,468 @@ +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +// Each operation gets its own TEST_CASE so Catch2 doesn't re-run setup for +// every SECTION and overwrite the exported JSON with only the last result. 
+ +TEST_CASE("Node insertion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + // Warmup — 500ms discard to prime caches, branch predictor, allocators + { + auto warmup_end = std::chrono::steady_clock::now() + std::chrono::milliseconds(500); + while (std::chrono::steady_clock::now() < warmup_end) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + } + } + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("node_insert", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Node insert throughput: " << ops_per_sec << " ops/sec"); + CHECK(ops_per_sec >= MIN_EXPECTED_THROUGHPUT_OPS); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_throughput"); +} + +TEST_CASE("Node read throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + std::vector 
node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) node_ids.push_back(result.value()); + } + REQUIRE(!node_ids.empty()); + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + auto node = graph->get_node(node_ids[operations % node_ids.size()]); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("node_read", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Node read throughput: " << ops_per_sec << " ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_throughput"); +} + +TEST_CASE("Node update throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "update_test"); + auto insert_result = graph->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t node_id = insert_result.value(); + + // Warmup — 500ms discard + { + auto warmup_end = std::chrono::steady_clock::now() + std::chrono::milliseconds(500); + uint32_t w = 0; + while (std::chrono::steady_clock::now() < warmup_end) { + auto node = graph->get_node(node_id); + if (node) { + graph->add_or_modify_attrib_local(*node, static_cast(w++ % 1000)); + 
graph->update_node(*node); + } + } + } + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + auto node = graph->get_node(node_id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(operations % 1000)); + graph->update_node(*node); + operations++; + } + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("node_update", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Node update throughput: " << ops_per_sec << " ops/sec"); + CHECK(ops_per_sec >= MIN_EXPECTED_THROUGHPUT_OPS); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_throughput"); +} + +TEST_CASE("Edge insertion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector target_ids; + target_ids.reserve(10000); + for (uint64_t i = 0; i < 10000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) target_ids.push_back(result.value()); + } + REQUIRE(!target_ids.empty()); + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + uint64_t target = target_ids[operations % target_ids.size()]; + auto edge = 
GraphGenerator::create_test_edge(root->id(), target, graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("edge_insert", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Edge insert throughput: " << ops_per_sec << " ops/sec"); + CHECK(ops_per_sec >= MIN_EXPECTED_THROUGHPUT_OPS); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_throughput"); +} + +TEST_CASE("Edge read throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector target_ids; + target_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) { + target_ids.push_back(result.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), result.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + } + REQUIRE(!target_ids.empty()); + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + uint64_t target = target_ids[operations % target_ids.size()]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + 
std::chrono::steady_clock::now() - start); + + collector.record_throughput("edge_read", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Edge read throughput: " << ops_per_sec << " ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_throughput"); +} + +TEST_CASE("Mixed operations throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("mixed_ops_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector node_ids; + node_ids.reserve(500); + for (uint64_t i = 0; i < 500; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) node_ids.push_back(result.value()); + } + REQUIRE(!node_ids.empty()); + + uint64_t operations = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + int op_type = operations % 10; + if (op_type < 4) { + auto node = graph->get_node(node_ids[operations % node_ids.size()]); + } else if (op_type < 7) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + if (result.has_value()) node_ids.push_back(result.value()); + } else { + auto node = graph->get_node(node_ids[operations % node_ids.size()]); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(operations)); + graph->update_node(*node); + } + } + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + 
std::chrono::steady_clock::now() - start); + + collector.record_throughput("mixed_ops", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Mixed ops throughput: " << ops_per_sec << " ops/sec"); + CHECK(ops_per_sec >= MIN_EXPECTED_THROUGHPUT_OPS); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "mixed_ops_throughput"); +} + +TEST_CASE("Node deletion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_delete_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + // Pre-populate a large pool so we can delete without running out. + // We refill the pool when it drops below a threshold. 
+ std::vector node_ids; + node_ids.reserve(50000); + for (uint64_t i = 0; i < 50000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + uint64_t operations = 0; + size_t pool_idx = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + if (pool_idx >= node_ids.size()) break; + graph->delete_node(node_ids[pool_idx++]); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("node_delete", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Node delete throughput: " << ops_per_sec << " ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_delete_throughput"); +} + +TEST_CASE("Edge deletion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_delete_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 50K nodes with edges from root → node + std::vector target_ids; + target_ids.reserve(50000); + for (uint64_t i = 0; i < 50000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) { + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + 
target_ids.push_back(res.value()); + } + } + REQUIRE(!target_ids.empty()); + + uint64_t operations = 0; + size_t pool_idx = 0; + auto start = std::chrono::steady_clock::now(); + auto end = start + TEST_DURATION; + + while (std::chrono::steady_clock::now() < end) { + if (pool_idx >= target_ids.size()) break; + graph->delete_edge(root->id(), target_ids[pool_idx++], "test_edge"); + operations++; + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("edge_delete", operations, actual_duration); + + double ops_per_sec = static_cast(operations) / + (static_cast(actual_duration.count()) / 1000.0); + INFO("Edge delete throughput: " << ops_per_sec << " ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_delete_throughput"); +} + +// Catch2 BENCHMARK macros (microbenchmark mode, run with [!benchmark]) +TEST_CASE("Single agent operations (Catch2 BENCHMARK)", "[THROUGHPUT][single][!benchmark]") { + MultiAgentFixture fixture; + GraphGenerator generator; + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + BENCHMARK("Node insert") { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + return graph->insert_node(node); + }; + + auto read_node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto read_id_opt = graph->insert_node(read_node); + REQUIRE(read_id_opt.has_value()); + uint64_t read_id = read_id_opt.value(); + + BENCHMARK("Node read") { + return graph->get_node(read_id); + }; + + BENCHMARK("Node update") { + auto node = graph->get_node(read_id); + if (node) { + graph->add_or_modify_attrib_local(*node, 42); + return graph->update_node(*node); + } + return false; + }; +} diff --git a/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp 
b/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp new file mode 100644 index 0000000..13214ae --- /dev/null +++ b/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp @@ -0,0 +1,348 @@ +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// Each TEST_CASE measures both throughput (5-second window) and latency +// simultaneously, recording both to the same collector and exporting to a +// unique JSON file. Tags {"threads","1","graph_size","0"} mark these as the +// single-thread, empty-graph baseline for the Scalability tab. + +TEST_CASE("Node insert latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end) { + auto node = GraphGenerator::create_test_node(ops, graph->get_agent_id()); + { + auto t = tracker.scoped_record(); + graph->insert_node(node); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("node_insert", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("node_insert", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_lat_thr"); +} + +TEST_CASE("Node read latency+throughput", "[THROUGHPUT][LATENCY][single]") { + 
MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate 1000 nodes for round-robin reads + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end) { + uint64_t id = node_ids[ops % node_ids.size()]; + { + auto t = tracker.scoped_record(); + auto node = graph->get_node(id); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("node_read", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("node_read", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_lat_thr"); +} + +TEST_CASE("Node update latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "update_test"); + auto insert_result = graph->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t node_id = insert_result.value(); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + + auto start 
= steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end) { + auto node = graph->get_node(node_id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(ops % 1000)); + { + auto t = tracker.scoped_record(); + graph->update_node(*node); + } + ops++; + } + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("node_update", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("node_update", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_lat_thr"); +} + +TEST_CASE("Edge insert latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate target node pool + std::vector target_ids; + target_ids.reserve(10000); + for (uint64_t i = 0; i < 10000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) target_ids.push_back(res.value()); + } + REQUIRE(!target_ids.empty()); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end) { + uint64_t target = target_ids[ops % target_ids.size()]; + auto edge = GraphGenerator::create_test_edge(root->id(), target, graph->get_agent_id()); + { + auto t = tracker.scoped_record(); + graph->insert_or_assign_edge(edge); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + 
collector.record_throughput("edge_insert", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("edge_insert", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_lat_thr"); +} + +TEST_CASE("Edge read latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 1000 nodes + edges + std::vector target_ids; + target_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) { + target_ids.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + } + } + REQUIRE(!target_ids.empty()); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end) { + uint64_t target = target_ids[ops % target_ids.size()]; + { + auto t = tracker.scoped_record(); + auto edge = graph->get_edge(root->id(), target, "test_edge"); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("edge_read", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("edge_read", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_lat_thr"); +} 
+ +TEST_CASE("Node delete latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_delete_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate a large pool to delete from + std::vector node_ids; + node_ids.reserve(50000); + for (uint64_t i = 0; i < 50000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) node_ids.push_back(res.value()); + } + REQUIRE(!node_ids.empty()); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + size_t pool_idx = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end && pool_idx < node_ids.size()) { + { + auto t = tracker.scoped_record(); + graph->delete_node(node_ids[pool_idx++]); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("node_delete", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("node_delete", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_delete_lat_thr"); +} + +TEST_CASE("Edge delete latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_delete_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 50K nodes with edges from root → node + std::vector target_ids; + target_ids.reserve(50000); + for (uint64_t i 
= 0; i < 50000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + if (res.has_value()) { + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + target_ids.push_back(res.value()); + } + } + REQUIRE(!target_ids.empty()); + + LatencyTracker tracker(500000); + uint64_t ops = 0; + size_t pool_idx = 0; + + auto start = steady_clock::now(); + auto end = start + seconds(5); + + while (steady_clock::now() < end && pool_idx < target_ids.size()) { + { + auto t = tracker.scoped_record(); + graph->delete_edge(root->id(), target_ids[pool_idx++], "test_edge"); + } + ops++; + } + + auto dur = duration_cast(steady_clock::now() - start); + collector.record_throughput("edge_delete", ops, dur, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_latency_stats("edge_delete", tracker.stats(), + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_delete_lat_thr"); +}