From b4031611e299d0038a1ae680e9503d0d461f9a91 Mon Sep 17 00:00:00 2001
From: Nico <nico@openclaw>
Date: Fri, 3 Apr 2026 18:18:09 +0200
Subject: [PATCH] Add --repeat=N mode to test runner with timing stats
 (avg/p50/p95)

- run_tests.py: --repeat=N runs each test N times, aggregates into one result
- Stats include: runs, passed, pass_rate, min/avg/p50/p95/max_ms
- Stats posted in result.stats field for dashboard display
- Works with all suites (engine, api, matrix, roundtrip)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 tests/run_tests.py | 124 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 103 insertions(+), 21 deletions(-)

diff --git a/tests/run_tests.py b/tests/run_tests.py
index 275e214..5dbdc8b 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -3,12 +3,11 @@
 Test orchestrator — runs test suites and posts results to dev assay.
 
 Usage:
-  python tests/run_all_tests.py                          # all suites
-  python tests/run_all_tests.py api                      # one suite
-  python tests/run_all_tests.py roundtrip                # one suite
-  python tests/run_all_tests.py api/health               # single test
-  python tests/run_all_tests.py roundtrip/full_eras      # single test
-  python tests/run_all_tests.py api/health roundtrip/full_chat  # multiple tests
+  python tests/run_tests.py                              # all suites
+  python tests/run_tests.py api                          # one suite
+  python tests/run_tests.py 'matrix/eras_query[haiku]'   # single test
+  python tests/run_tests.py matrix --repeat=3            # each test 3x, report avg/p50/p95
+  python tests/run_tests.py api/health roundtrip/full_chat  # multiple tests
 
 Test names: suite/name (without the suite prefix in the test registry).
   engine tests:    graph_load, node_instantiation, edge_types_complete,
@@ -46,6 +45,7 @@ class TestResult:
     duration_ms: float = 0
     error: str = ''
     ts: str = ''
+    stats: dict = field(default_factory=dict)  # {runs, min_ms, avg_ms, p50_ms, p95_ms, max_ms, pass_rate}
 
 
 def post_result(result: TestResult):
@@ -125,29 +125,100 @@ SUITES = {
 }
 
 
-def parse_filters(args: list[str]) -> tuple[set[str] | None, set[str]]:
-    """Parse CLI args into (suite_filter, test_filter).
+def _compute_stats(durations: list[float], passed: int, total: int) -> dict:
+    """Return timing and pass-rate stats for durations (ms); {} if durations is empty."""
+    if not durations:
+        return {}
+    ordered = sorted(durations)  # sorted copy: leave the caller's list untouched
+    n = len(ordered)
+    return {
+        'runs': total,
+        'passed': passed,
+        'pass_rate': round(100 * passed / total) if total else 0,
+        'min_ms': round(ordered[0]),
+        'avg_ms': round(sum(ordered) / n),
+        'p50_ms': round(ordered[n // 2]),  # upper median when n is even
+        'p95_ms': round(ordered[min(int(n * 0.95), n - 1)]),  # index clamped to last item
+        'max_ms': round(ordered[-1]),
+    }
+
+
+def run_test_repeated(name: str, suite: str, fn, repeat: int) -> TestResult:
+    """Run fn `repeat` times and post one aggregated TestResult (stats + summary)."""
+    # Post running status
+    result = TestResult(run_id=RUN_ID, test=name, suite=suite, status='running', ts=_now_iso())
+    post_result(result)
+
+    durations = []
+    passed_count = 0
+    last_error = ''
+
+    for _ in range(repeat):
+        start = time.perf_counter()  # monotonic: immune to wall-clock adjustments
+        try:
+            fn()
+            passed_count += 1
+        except Exception as e:  # AssertionError is already an Exception subclass
+            last_error = str(e)[:200]
+        # Failed runs are timed too, so the stats cover every attempt.
+        durations.append(round((time.perf_counter() - start) * 1000))
+
+    stats = _compute_stats(durations, passed_count, repeat)
+    result.stats = stats
+    result.duration_ms = stats.get('avg_ms', 0)
+    result.status = 'pass' if passed_count == repeat else ('fail' if passed_count > 0 else 'error')
+    # .get() avoids a KeyError when stats is {} (no runs, i.e. repeat <= 0).
+    result.error = (f'{stats.get("pass_rate", 0)}% pass, avg={stats.get("avg_ms", 0)}ms '
+                    f'p50={stats.get("p50_ms", 0)}ms p95={stats.get("p95_ms", 0)}ms')
+    if last_error and passed_count < repeat:
+        result.error += f' | last err: {last_error}'
+    result.ts = _now_iso()
+    post_result(result)
+    return result
+
+
+def parse_args(args: list[str]) -> tuple[set[str] | None, set[str], int]:
+    """Parse CLI args into (suite_filter, test_filter, repeat).
+
+    Supports: --repeat=N or --repeat N (N must be an integer >= 1, else ValueError).
 
     Returns:
         suite_filter: set of suite names, or None for all suites
         test_filter: set of 'suite/test' names (empty = run all in suite)
+        repeat: number of times to run each test (default 1)
     """
-    if not args:
-        return None, set()
+    repeat = 1
+    filtered_args = []
+    remaining = iter(args)  # shared iterator so '--repeat N' can consume N
+    for arg in remaining:
+        if arg == '--repeat' or arg.startswith('--repeat='):
+            value = arg[len('--repeat='):] if '=' in arg else next(remaining, '')
+            try:
+                repeat = int(value)
+            except ValueError:
+                repeat = 0  # non-numeric or missing value: rejected just below
+            if repeat < 1:
+                raise ValueError(f'--repeat expects a positive integer, got {value!r}')
+        else:
+            filtered_args.append(arg)
+
+    if not filtered_args:
+        return None, set(), repeat
 
     suites = set()
     tests = set()
-    for arg in args:
+    for arg in filtered_args:
         if '/' in arg:
             tests.add(arg)
             suites.add(arg.split('/')[0])
         else:
             suites.add(arg)
-    return suites, tests
+    return suites, tests, repeat
 
 
-def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestResult]:
-    """Run tests from a suite, optionally filtered."""
+def run_suite(suite_name: str, tests: dict, test_filter: set[str],
+              repeat: int = 1) -> list[TestResult]:
+    """Run tests from a suite, optionally filtered and repeated."""
     results = []
     for name, fn in tests.items():
         # Apply test filter if specified
@@ -157,21 +228,32 @@ def run_suite(suite_name: str, tests: dict, test_filter: set[str]) -> list[TestR
         if test_filter and full_name not in test_filter and f'{suite_name}/{short_name}' not in test_filter:
             continue
 
-        r = run_test(name, suite_name, fn)
+        if repeat > 1:
+            r = run_test_repeated(name, suite_name, fn, repeat)
+            status = 'PASS' if r.status == 'pass' else 'FAIL'
+            stats = r.stats
+            print(f'  [{status}] {suite_name}/{name} ×{repeat} '
+                  f'(avg={stats.get("avg_ms", 0)}ms p50={stats.get("p50_ms", 0)}ms '
+                  f'p95={stats.get("p95_ms", 0)}ms pass={stats.get("pass_rate", 0)}%)', flush=True)
+        else:
+            r = run_test(name, suite_name, fn)
+            status = 'PASS' if r.status == 'pass' else 'FAIL'
+            print(f'  [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
+
         results.append(r)
-        status = 'PASS' if r.status == 'pass' else 'FAIL'
-        print(f'  [{status}] {suite_name}/{name} ({r.duration_ms:.0f}ms)', flush=True)
-        if r.error:
+        if r.error and repeat == 1:
             print(f'           {r.error[:200]}', flush=True)
     return results
 
 
 def main():
-    suite_filter, test_filter = parse_filters(sys.argv[1:])
+    suite_filter, test_filter, repeat = parse_args(sys.argv[1:])
 
     print(f'=== Test Run {RUN_ID} ===', flush=True)
     if suite_filter:
         print(f'Filter: suites={suite_filter}, tests={test_filter or "all"}', flush=True)
+    if repeat > 1:
+        print(f'Repeat: {repeat}x per test', flush=True)
     print(f'ASSAY_API: {os.environ.get("ASSAY_API", "not set")}', flush=True)
     print(f'NYX_URL: {os.environ.get("NYX_URL", "not set")}', flush=True)
     print(flush=True)
@@ -181,9 +263,9 @@ def main():
     for suite_name, loader in SUITES.items():
         if suite_filter and suite_name not in suite_filter:
             continue
-        print(f'--- {suite_name} ---', flush=True)
+        print(f'--- {suite_name}{" ×" + str(repeat) if repeat > 1 else ""} ---', flush=True)
         tests = loader()
-        all_results.extend(run_suite(suite_name, tests, test_filter))
+        all_results.extend(run_suite(suite_name, tests, test_filter, repeat))
         print(flush=True)
 
     # Summary